source("scr/libraries.R")
source("scr/ggplot.R")
source("scr/VCFfilterstats.R")
source("scr/HaplotypR.R")
source("scr/xtrafunctions.R")
source("scr/genind.R")

Read mapping

Map reads

Any renaming of files needs to happen before fastq-files are mapped. The population designation (before underscore) is used by FreeBayes to call SNPs so it is important that population designation make biological sense.

Values chosen for reduced representation reference:

  • c = 0.8
  • K1 = 5
  • K2 = 2

Transfer copy of reference.fasta and associated files Reference constructed using c = 0.8, K1 = 5, and K2 = 2. into each directory containing demultiplexed and quality trimmed data/SEQ

Run dDocent from within each Library directory to map reads to reference.fasta.

Write configuration file to run dDocent in each reference directory.

Execute dDocent from within each reference folder to initiate a run to map individuals to reduced representation reference and call SNPs.

QA/QC read mapping

Query mapping statistics

During the mapping stage, dDocent calls BWA to map reads from the individuals in the folder to the generated reduced representation reference and create a -RG.bam-file for each individual. The second column of a BAM (or SAM) file contains FLAGs with binary encoded information on mapping quality, pairedness etc. that can be used to compare the mapping efficiency to the reduced representation reference.

The number of reads mapped can be counted using samtools idxstats <aln-RG.bam> which retrieves stats from the bam-file. The output is TAB-delimited file with each line consisting of the reference sequence name, sequence length, # mapped reads and # unmapped (empty) reads. In addition, samtools can also be be used to query samtools flagstat file.bam which returns an output containing the number of reads for which each flag is true.

dDocent creates a file called bamlist.list containing all the bam files that were generated during read mapping in dDocent using BWA. Write script to gather flagstats from all bam-files.

Run flagstats.

Write script to gather idxstats from all bam-files.

Run indxstats.

Format stats files into tidy data sets

Appending the file results in the information per individual being printed in a new set of row being appended to the file, i.e. there will be as many rows for a given locus as individuals were mapped. The file can be re-formatted and summary statistics calculated using dplyr and tidyr.

Format idxstats:


# create vectors of files to be imported, reference codes, K1 and K2, dataframe names
filenames <- list.files(path = "data/SEQ", pattern = "*.idxstats")
names <- substr(filenames, 1, 8)
lib <- substr(filenames, 1, 4)

# import data
for (i in names){
  filepath <- file.path("data/SEQ", paste(i, 'stats', sep =""))
  assign(i, read.table(filepath, sep = "", header = FALSE,
                     col.names = c("Locus", "Length", "Reads_Mapped", 'blank')) %>%
           select(1:3))
  }

# # make sure to delete old list if rerunning the code
rm(dflist_idx)
rm(MapStats.idx)

# Create list of one dataframe per idxstats file and group by locus
dflist_idx <- lapply(ls(pattern = "*.idx"), get)

for (df in 1:length(dflist_idx)){
  x <- dflist_idx[[df]]
  x[['Locus']] <- as.character(x[['Locus']])
  x = x %>% group_by(Locus)
  dflist_idx[[df]] <- x
}

# Create new dataframes with summary stats per library and bind into final output/dataframe
MapStats.idx <- data.frame()

for (df in 1:length(dflist_idx)){
    
  x = summarize(dflist_idx[[df]],
                Length = mean(Length),
                Mean_Mapped = mean(Reads_Mapped),
                Sum_Mapped = sum(Reads_Mapped),
                Min_Mapped = min(Reads_Mapped),
                Max_Mapped = max(Reads_Mapped),
                SD_Mapped = sd(Reads_Mapped))
  x[x == 0] <- NA

  temp <- summarize(x, Mean_Mapped_Non0 = mean(Mean_Mapped, na.rm = TRUE)) %>%
    mutate(Lib = lib[df],
           Not_Mapped = nrow(filter(x, is.na(Sum_Mapped))),
           N_Loci_Ref = nrow(x)) %>%
    select(Lib, N_Loci_Ref, Not_Mapped, Mean_Mapped_Non0)

  MapStats.idx <- bind_rows(MapStats.idx, temp)
}

MapStats.idx <- MapStats.idx %>%
  mutate(PROP_EMPTY = round((Not_Mapped/N_Loci_Ref)*100, digits = 2),
         CONTIGS_MAPPED = N_Loci_Ref - Not_Mapped)

write.table(MapStats.idx, "results/MapStats.idx", quote = FALSE)

# remove large (duplicate) files
rm(BMA1.idx)
rm(BMA2.idx)
rm(BMA3.idx)

MapStats.idx

Format flagstats:


# Files to be imported
filenames <- list.files(path='data/SEQ', pattern = '*.flagstats')

# create vectors of files to be imported
names <- substr(filenames, 1, 9)
lib <- substr(filenames, 1, 4)

# import data
for (i in names){
  filepath <- file.path('data/SEQ', paste(i, 'stats', sep =""))
  assign(i, read.csv(filepath, sep = "+", header = FALSE,
                     col.names = c("N_Reads", "CAT"),
                     stringsAsFactors = FALSE) %>%
           select(1:2))
}

# Create list of one dataframe per flagstats file and create tidy data set
# should be 3 elements/libraries
rm(dflist_flag)

dflist_flag <- lapply(ls(pattern = "*flag"), get)

# Change N_Reads to numeric
for (df in 1:length(dflist_flag)){
  x <- dflist_flag[[df]]
  x[['N_Reads']] <- as.numeric(x[['N_Reads']])
  dflist_flag[[df]] <- x
}

for (df in 1:length(dflist_flag)){
  x <- dflist_flag[[df]]
  
  n <- nrow(x)/14

  x <- x %>%
    filter(grepl("0 mapped|properly paired|mapQ>=5", CAT)) %>%
    mutate(MAPSTAT = ifelse(grepl("mapQ>=5", CAT), "Mismatch",
                   ifelse(grepl("properly", CAT), "Prop_Paired", "Mapped"))) %>%
    mutate(Ind = c(rep(1:n, each = 3))) %>% 
    # not sure if extra individual in there somehow
    select(4, 3, 1) %>%
    spread(MAPSTAT, N_Reads)

  dflist_flag[[df]] <- x
}

# Create new dataframes with summary stats and add to main final data frame
MapStats.flag <- data.frame()

for (df in 1:length(dflist_flag)){
  x = summarize(dflist_flag[[df]], Sum_Mapped = sum(Mapped),
                             Reads_Mapped = mean(Mapped),
                             Sum_Paired = sum(Prop_Paired),
                             Mean_Paired = mean(Prop_Paired),
                             Sum_Mismatch = sum(Mismatch),
                             Mean_Mismatch = mean(Mismatch)) %>%
  mutate(Lib = lib[df]) %>%
  select(7, 1:6)
  MapStats.flag <- bind_rows(MapStats.flag, x)
}

# write to file
write.table(MapStats.flag, "results/MapStats.flag", quote = FALSE)

MapStats.flag

# combine files
mapstats <- left_join(MapStats.idx, MapStats.flag) %>%
  mutate(PROP_MISMATCH = Sum_Mismatch/Sum_Mapped)

# write summary stats file
write.table(mapstats, file = "results/BWA_mapping.stats", quote = FALSE, sep = " ")

SNP calling

Transfer copy of reference.fasta and associated files into SNP calling directory.

Create symlinks from all fq, bam and bam.bai-files for each separately mapped library in SNP_Calling folder.

Execute dDocent from within SNP_Calling-folder to call variants across all individuals (all libraries).

File TotalrawSNPs.vcf contains all raw SNP/INDEL calls. Do not need to keep links of fq.gz-, bam-, .bam.bai-files after SNPs have been called. Copy TotalRaSNPs.vcf to VCF for SNP filtering.

SNP filtering

dDocent uses FreeBayes to call SNPs and write a VCF-file TotalrawSNPs.vcf. This data set was filtered to remove low quality and artefactual SNP sites, paralogs and low quality individuals based on levels of missing data, minimum/maximum read depth, genotype call rate, and minor allele frequencies. Contigs may contain more than one SNP; the script rad_haplotyper.pl was used to create haplotypes for each locus.

Raw data

Compare Individual & SNP stats

Use VCFtools to create stats files for depth, missing data, heterozygosity and site quality for the raw data.

Compare raw stats.


# load stats files ----
ind_stats_raw <- read.ind.stats(dir = "data/VCF", vcf = "BMA_raw") %>%
  separate(INDV, into = c("SP", "LIB", "SAMPLE_ID"), sep = "_", remove = FALSE)

loc_stats_raw <- read.loc.stats(dir = "data/VCF/", vcf = "BMA_raw")

# plot missing data per indv ----
p1 <- ggplot(ind_stats_raw, aes(x = MISS_BMA_raw)) +
  geom_histogram(binwidth = .01, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MISS_BMA_raw, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 0.5),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "missing data per indv") +
  theme_standard

# plot read depth per indv ----
p2 <- ggplot(ind_stats_raw, aes(x = MEAN_DEPTH_BMA_raw)) +
  geom_histogram(binwidth = 10, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH_BMA_raw, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean read depth per indv") +
  theme_standard

# plot depth vs missing ----
p3 <- ggplot(ind_stats_raw, aes(x = MEAN_DEPTH_BMA_raw, y = MISS_BMA_raw)) +
  geom_point() +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH_BMA_raw, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = mean(MISS_BMA_raw, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = 0.5),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean depth per indv", y = "% missing data") +
  theme_standard

# plot Fis per indv ----
p4 <- ggplot(ind_stats_raw, aes(x = Fis_BMA_raw)) +
  geom_histogram(binwidth = .01, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(Fis_BMA_raw, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 0),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "Fis per indv") +
  theme_standard

# plot Fis vs missing data per indv ----
p5 <- ggplot(ind_stats_raw, aes(x = Fis_BMA_raw, y = MISS_BMA_raw)) +
  geom_point() +
  geom_vline(aes(xintercept = mean(Fis_BMA_raw, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 0),
                 color = "darkblue", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = mean(MISS_BMA_raw, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = 0.5),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "Fis per indv", y = "% missing data") +
  theme_standard

# plot Fis vs mean depth per indv ----
p6 <- ggplot(ind_stats_raw, aes(x = Fis_BMA_raw, y = MEAN_DEPTH_BMA_raw)) +
  geom_point() +
  geom_vline(aes(xintercept = mean(Fis_BMA_raw, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 0),
                 color = "darkblue", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = mean(MEAN_DEPTH_BMA_raw, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "Fis per indv", y = "mean depth per indv") +
  theme_standard


# plot distribution missing data per locus ----
p7 <- ggplot(loc_stats_raw, aes(x = MISS_BMA_raw)) +
  geom_histogram(binwidth = 0.01, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MISS_BMA_raw, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 0.1),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "% missing data per locus") +
  theme_standard

# plot distribution mean read depth ----
p8 <- ggplot(loc_stats_raw, aes(x = MEAN_DEPTH_BMA_raw)) +
  geom_histogram(binwidth = 20, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH_BMA_raw, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean read depth per locus") +
  theme_standard

# plot read depth vs missing data ----
p9 <- ggplot(loc_stats_raw, aes(x = MEAN_DEPTH_BMA_raw, y = MISS_BMA_raw)) +
  geom_point() +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH_BMA_raw, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = mean(MISS_BMA_raw, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = 0.1),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean depth per locus", y = "% missing data") +
  theme_standard

# plot depth vs SNP quality ----
site_qual <- read.table("data/VCF/BMA_raw.lqual", 
                        header = TRUE, stringsAsFactors = FALSE) %>%
  mutate(PROB = 10^(-QUAL/10))

temp <- data.frame(loc_stats_raw$MEAN_DEPTH_BMA_raw, site_qual$QUAL) %>%
  rename(depth = loc_stats_raw.MEAN_DEPTH_BMA_raw, qual = site_qual.QUAL)

p10 <- ggplot(temp, aes(x = depth, y = qual)) +
  geom_point(size = 1) +
  geom_vline(aes(xintercept = mean(depth, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = mean(qual, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean depth per locus", y = "SNP quality") +
  theme_standard

# plot number of SNPs per contig vs. mean depth ----
temp <- loc_stats_raw %>%
  count(CHR)

p11 <- left_join(temp, loc_stats_raw) %>%
  ggplot() +
  geom_point(aes(x = n, y = MEAN_DEPTH_BMA_raw)) +
  labs(x = "number of SNPs per contig", y = "mean depth") +
  theme_standard

# plot no of SNPs per locus ----
p12 <- loc_stats_raw %>%
  count(CHR) %>%
  ggplot(aes(x = n)) +
  geom_histogram(binwidth = 1, color = "black", fill = "darkorange") + 
  labs(x = "number of SNPs per locus") +
  theme_standard

mraw <- multiplot(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, cols=2)

Choose threshold values for quality score, coverage, missing data, minor alleles and mapping/variant calling artifacts

Filter 0: Remove LQ individuals

Remove (known) low quality individuals from data set:

Identify low quality individuals to remove from the data set; defined as individuals with a mean coverage of five or less reads across all loci, with more than 31% missing data, or Fis > (thresholds based on exploratory filtering).

[1] 18

Remove low quality individuals and decompose indels.

Compare stats:

Data set contains 406 individuals and 1311068 loci after removing low quality individuals and decomposing indels set into SNPs.

Import singletons and genotype depth file to create list of loci to exclude based on depth.

Data set contains nrow(singletons) singletons.

Of those nrow(doubletons) are called as a homozygote in one individual.

The data set contains nrow(contigs_total) contigs, nrow(contigs_singletons) (2round( (nrow(contigs_singletons)/nrow(contigs_total)*100), digits = 2)%) contain singletons.

Target coverage is 20 reads per locus; flag singletons with depth < 10 reads and doubletons with depth < 20 reads.

Distriubtion of genotype depth per singleton/doubleton.

Quantiles distribution of read depth for SNP loci called in only one individual.

 5% 25% 50% 75% 95% 99% 
  1   3   5  10  32 139 

Distribution of doubletons (homzygous genotype for that individual) and singletons (heterozygote genotype).

Quantiles distribution number of singletons called in one individual.

     1%      5%     25%     50%     75%     99% 
  46.00   56.00   88.75  327.50  534.25 1815.90 

Compare the number of SNPs vs complex variants (INDELs).

Filter 1: Confidence in SNP call

The QUAL column of a VCF file is a phred based score indicating the probability that the variant shown in the ALT column is wrong. Given the Phred quality score (Q), and the probability that a base is incorrectly called (P), Q = -10(Log10P). A quality score of 20 indicates, a 1 in 100 chance that the SNP site has been called incorrectly (i.e. 99% probability that correct call).

Filter loci with quality score < 20 and singletons/doubletons with low read depth (<10/20 reads). Code genotypes with genotype call quality < 20 or genotype depth < 5 as missing.

Compare stats post-filtering.

Data set contains 406 individuals and 307278 SNP sites.

Removing low confidence SNP loci and genotype calls results in a reduction in the number of the (maximum) SNPs per locus.

Coding genotypes with low read depth (< 5) as missing, results in an overall increase in missing data per locus and a shift in more individuals with more missing data - this is because individuals (and loci) with coverage issues are now characterized by higher missing data.

Compare depth individuals after removing LQ SNP loci and genotypes.

Filter 2: Genotype call rate and allowed missing data per indv

Remove loci with genotype call rate < 50% and minimum mean depth < 15 reads across all individuals.


vcftools --vcf data/VCF/temp/BMA.F1.recode.vcf --out data/VCF/temp/BMA.F2a --max-missing 0.5 --min-meanDP 15 --recode --recode-INFO-all

# library BMA-1
vcftools --vcf data/VCF/temp/BMA.F2a.recode.vcf --out data/VCF/temp/BMA1 --keep data/VCF/BMA1.ind  --recode --recode-INFO-all

vcftools --vcf data/VCF/temp/BMA1.recode.vcf --out data/VCF/BMA1 --missing-site

# library BMA-2
vcftools --vcf data/VCF/temp/BMA.F2a.recode.vcf --out data/VCF/temp/BMA2 --keep data/VCF/BMA2.ind  --recode --recode-INFO-all

vcftools --vcf data/VCF/temp/BMA2.recode.vcf --out data/VCF/BMA2 --missing-site

# library BMA-3
vcftools --vcf data/VCF/temp/BMA.F2a.recode.vcf --out data/VCF/temp/BMA3 --keep data/VCF/BMA3.ind  --recode --recode-INFO-all

vcftools --vcf data/VCF/temp/BMA3.recode.vcf --out data/VCF/BMA3 --missing-site

# Campeche
vcftools --vcf data/VCF/temp/BMA.F2a.recode.vcf --out data/VCF/temp/CAMP --keep data/VCF/CAMP.ind  --recode --recode-INFO-all

vcftools --vcf data/VCF/temp/CAMP.recode.vcf --out data/VCF/CAMP --missing-site

# Chandeleur Sound
vcftools --vcf data/VCF/temp/BMA.F2a.recode.vcf --out data/VCF/temp/CS --keep data/VCF/CS.ind  --recode --recode-INFO-all

vcftools --vcf data/VCF/temp/CS.recode.vcf --out data/VCF/CS --missing-site

# Corpus Christi Bay
vcftools --vcf data/VCF/temp/BMA.F2a.recode.vcf --out data/VCF/temp/CC --keep data/VCF/CC.ind  --recode --recode-INFO-all

vcftools --vcf data/VCF/temp/CC.recode.vcf --out data/VCF/CC --missing-site

# Florida Atlantic
vcftools --vcf data/VCF/temp/BMA.F2a.recode.vcf --out data/VCF/temp/FLA --keep data/VCF/FLA.ind  --recode --recode-INFO-all

vcftools --vcf data/VCF/temp/FLA.recode.vcf --out data/VCF/FLA --missing-site

# Florida Gulf South
vcftools --vcf data/VCF/temp/BMA.F2a.recode.vcf --out data/VCF/temp/FLGS --keep data/VCF/FLGS.ind  --recode --recode-INFO-all

vcftools --vcf data/VCF/temp/FLGS.recode.vcf --out data/VCF/FLGS --missing-site

# Florida Gulf North
vcftools --vcf data/VCF/temp/BMA.F2a.recode.vcf --out data/VCF/temp/FLGN --keep data/VCF/FLGN.ind  --recode --recode-INFO-all

vcftools --vcf data/VCF/temp/FLGN.recode.vcf --out data/VCF/FLGN --missing-site

# Louisiana
vcftools --vcf data/VCF/temp/BMA.F2a.recode.vcf --out data/VCF/temp/LA --keep data/VCF/LA.ind  --recode --recode-INFO-all

vcftools --vcf data/VCF/temp/LA.recode.vcf --out data/VCF/LA --missing-site

# Mississippi Sound
vcftools --vcf data/VCF/temp/BMA.F2a.recode.vcf --out data/VCF/temp/MISS --keep data/VCF/MISS.ind  --recode --recode-INFO-all

vcftools --vcf data/VCF/temp/MISS.recode.vcf --out data/VCF/MISS --missing-site

# Mobile Bay
vcftools --vcf data/VCF/temp/BMA.F2a.recode.vcf --out data/VCF/temp/MB --keep data/VCF/MB.ind  --recode --recode-INFO-all

vcftools --vcf data/VCF/temp/MB.recode.vcf --out data/VCF/MB --missing-site

Compare distribution of missing data per locus per library.

Flag loci that were not called in > 50% of individuals in a given library.

A total of

loci were called in less than 50% of individuals in one or more libraries. Loci being inconsistently called between libraries can result in library effects.

Compare distribution of missing data per locus per sample location.

Flag loci that were not called in > 75% of individuals at a given sample location.

Remove loci that were not consistently called across all libraries and sample locations.

Identify individuals with > 25% missing data

Remove flagged individuals.

Analyze stats post-filtering:

Filter 3: Filter loci and individuals based on depth, variance in depth and genotype call rate

Determine mean depth and variance per locus per library.

Compare distribution of depth coverage per locus per library. Identify loci that do not have consistent coverage between libraries (can lead to library effects), and/or across individuals.

Identify loci with large variance in mean depth across libraries and/or individuals by calculating the coefficient of variance (STD/MEAN).

Compare mean across all individual to mean weighted by library and coefficient of variance of read depth across individuals and between libraries:

If loci have consistent coverage across loci the mean read depth per locus across all individuals and weighted by library should fall on the red diagonal.

Flag loci that have high variation in mean coverage between individuals and between libraries.

Remove loci flagged for high variance in depth across all individuals and between libraries (removes library effects), filter loci with mean depth < 20 and genotype call rate < 75%

Compare mean depth and number of sites called per individual.

Use genotype depth file to identify individuals with high variance in read depth across loci.

Compare variability of depth within individuals.


p1 <- ggplot(idepth, aes(x = MAX, y = TOTAL)) +
  geom_smooth(method = "auto",linetype = "dashed", size = 1, color = "darkred") +
  geom_point(shape = 1) +
  theme_standard

p2 <- ggplot(idepth, aes(x = TOTAL, y = VAR)) +
  geom_smooth(method = "auto",linetype = "dashed", size = 1, color = "darkred") +
  geom_point(shape = 1) +
  theme_standard

p3 <- ggplot(idepth, aes(MAX, MEDIAN)) +
  geom_smooth(method = "auto",linetype = "dashed", size = 1, color = "darkred") +
  geom_point(shape = 1) +
  theme_standard

p4 <- ggplot(idepth, aes(x = MEAN, y = MEDIAN)) +
  geom_smooth(method = "auto",linetype = "dashed", size = 1, color = "darkred") +
  geom_point(shape = 1) +
  theme_standard

p5 <- ggplot(idepth, aes(x = COEFF_VAR, y = RATIO_MEAN_MEDIAN)) +
  geom_smooth(method = "auto",linetype = "dashed", size = 1, color = "darkred") +
  geom_point(shape = 1) +
  theme_standard

p6 <- ggplot(idepth, aes(MEAN)) +
  geom_histogram(binwidth = 5, color = "black", fill = "darkorange") +
  geom_vline(xintercept = 15, color = "darkred", linetype = "dashed") +
  theme_standard

p7 <- ggplot(idepth, aes(MEDIAN)) +
  geom_histogram(binwidth = 5, color = "black", fill = "darkorange") +
  geom_vline(xintercept = 10, color = "darkred", linetype = "dashed") +
  theme_standard

p8 <- ggplot(idepth, aes(RATIO_MEAN_MEDIAN)) +
  geom_histogram(binwidth = 0.1, color = "black", fill = "darkorange") +
  theme_standard

p9 <- ggplot(idepth, aes(COEFF_VAR)) +
  geom_histogram(binwidth = 0.025, color = "black", fill = "darkorange") +
  theme_standard

p10 <- ggplot(idepth, aes(x = COEFF_VAR, y = MISS_BMA.F2)) +
  geom_point(shape = 1) +
  geom_hline(yintercept = 0.25, linetype = "dashed", color = "darkred", size = 0.75) +
  theme_standard


m3b <- multiplot(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, cols=2)

Flag LQ individuals.

Remove flagged loci and individuals.

Compare stats post-filtering:


# load stats files ----
ind_stats_F3 <- read.ind.stats(dir = "data/VCF", vcf = "BMA.F3") %>%
  separate(INDV, into = c("SP", "LIB", "SAMPLE_ID"), sep = "_", remove = FALSE)

loc_stats_F3 <- read.loc.stats(dir = "data/VCF/", vcf = "BMA.F3")

# plot missing data per indv ----
p1 <- ggplot(ind_stats_F3, aes(x = MISS_BMA.F3)) +
  geom_histogram(binwidth = .01, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MISS_BMA.F3, na.rm = TRUE)),
             color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 0.25),
             color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "missing data per indv") +
  theme_standard

# plot read depth per indv ----
p2 <- ggplot(ind_stats_F3, aes(x = MEAN_DEPTH_BMA.F3)) +
  geom_histogram(binwidth = 10, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH_BMA.F3, na.rm = TRUE)),
             color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
             color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean read depth per indv") +
  theme_standard

# plot depth vs missing ----
p3 <- ggplot(ind_stats_F3, aes(x = MEAN_DEPTH_BMA.F3, y = MISS_BMA.F3)) +
  geom_point() +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH_BMA.F3, na.rm = TRUE)),
             color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
             color = "darkblue", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = mean(MISS_BMA.F3, na.rm = TRUE)),
             color = "red", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = 0.25),
             color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean depth per indv", y = "% missing data") +
  theme_standard

# plot distribution missing data per locus ----
p4 <- ggplot(loc_stats_F3, aes(x = MISS_BMA.F3)) +
  geom_histogram(binwidth = 0.01, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MISS_BMA.F3, na.rm = TRUE)),
             color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 0.1),
             color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "% missing data per locus") +
  theme_standard

# plot distribution mean read depth ----
p5 <- ggplot(loc_stats_F3, aes(x = MEAN_DEPTH_BMA.F3)) +
  geom_histogram(binwidth = 20, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH_BMA.F3, na.rm = TRUE)),
             color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
             color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean read depth per locus") +
  theme_standard

# plot read depth vs missing data ----
p6 <- ggplot(loc_stats_F3, aes(x = MEAN_DEPTH_BMA.F3, y = MISS_BMA.F3)) +
  geom_point() +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH_BMA.F3, na.rm = TRUE)),
             color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
             color = "darkblue", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = mean(MISS_BMA.F3, na.rm = TRUE)),
             color = "red", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = 0.1),
             color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean depth per locus", y = "% missing data") +
  theme_standard

m3 <- multiplot(p1, p2, p3, p4, p5, p6, cols=2)

Filter 4: Allele balance

AB: Allele balance at heterozygous sites: a number between 0 and 1 representing the ratio of reads showing the reference allele to all reads, considering only reads from individuals called as heterozygous

Allele balance is the ratio of reads for reference allele to all reads, considering only reads from individuals called as heterozygous. Values range from 0 - 1; allele balance (for real loci) should be approx. 0.5. Filter SNPs for which the with allele balance < 0.25 and > 0.75.

Filter contigs with SNP calls with AB > 0.25, AB > 0.75; retain loci very close to 0 (retain loci that are fixed variants). Remove genotypes if the quality sum of the reference or alternate allele was 0.

Remaining SNPs: 52,610.

Filter 7: Strand balance

SRF: Number of reference observations on the forward strand SAF: Number of alternate observations on the forward strand SRR: Number of reference observations on the reverse strand SAR: Number of alternate observations on the reverse strand

Paired end reads should not overlap, and a SNP site should only be covered by either the forward or reverse read (strand).

Remove SNP sites that have > 100x more forward alternate reads than reverse alternate reads and > 100x more forward reverse reads than reverse alternate reads.

Number of SNPs remaining: 26,722.

Filter 8: Properly paired status

PAIRED: Proportion of observed alternate alleles which are supported by properly paired read fragments PAIREDR: Proportion of observed reference alleles which are supported by properly paired read fragments

Identify loci with only unpaired reads mapping to them - an artifact introduced due de novo reference assembly. Compare number of paired reads mapping the reference and the alternate alleles to identify discrepancy in the paired status for reads supporting reference and alternate alleles.

Number of SNPs in data set: 23,778.

Filter 9: Maximum depth & Quality

Identify distribution of depth (based on original data set) to identify loci with excess coverage.

Original number of individuals in data set is 424 (INFO flags in filtered data set are are based on original number of individuals in data set).

Create file with the original site depth and quality score for each site:

Calculate average depth and standard deviation:

Mean depth per locus (across all indivuals) is 3.867019810^{4} and the standard deviation is 4.218984410^{4}.

Filter SNP site with depth > mean depth + 1 standard deviation = 1.230498910^{5} and that have quality scores < 2x the depth at that site and output depth per site.

Compare the distribution of mean depth per site averaged across individuals to determine cut-off value of sites with excessively high depth indicative of paralogs/multicopy loci.

Choose cut-off for maximum mean read depth = 300.

Depth distribution per locus after filtering:

Analyze stats post-filtering:


# load stats files ----
ind_stats_F9 <- read.ind.stats(dir = "data/VCF", vcf = "BMA.F9") %>%
  separate(INDV, into = c("SP", "LIB", "SAMPLE_ID"), sep = "_", remove = FALSE)

loc_stats_F9 <- read.loc.stats(dir = "data/VCF/", vcf = "BMA.F9")

# plot missing data per indv ----
p1 <- ggplot(ind_stats_F9, aes(x = MISS_BMA.F9)) +
  geom_histogram(binwidth = .01, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MISS_BMA.F9, na.rm = TRUE)),
             color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 0.25),
             color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "missing data per indv") +
  theme_standard

# plot read depth per indv ----
p2 <- ggplot(ind_stats_F9, aes(x = MEAN_DEPTH_BMA.F9)) +
  geom_histogram(binwidth = 10, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH_BMA.F9, na.rm = TRUE)),
             color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
             color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean read depth per indv") +
  theme_standard

# plot depth vs missing ----
p3 <- ggplot(ind_stats_F9, aes(x = MEAN_DEPTH_BMA.F9, y = MISS_BMA.F9)) +
  geom_point() +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH_BMA.F9, na.rm = TRUE)),
             color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
             color = "darkblue", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = mean(MISS_BMA.F9, na.rm = TRUE)),
             color = "red", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = 0.25),
             color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean depth per indv", y = "% missing data") +
  theme_standard

# plot distribution missing data per locus ----
p4 <- ggplot(loc_stats_F9, aes(x = MISS_BMA.F9)) +
  geom_histogram(binwidth = 0.01, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MISS_BMA.F9, na.rm = TRUE)),
             color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 0.1),
             color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "% missing data per locus") +
  theme_standard

# plot distribution mean read depth ----
p5 <- ggplot(loc_stats_F9, aes(x = MEAN_DEPTH_BMA.F9)) +
  geom_histogram(binwidth = 5, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH_BMA.F9, na.rm = TRUE)),
             color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
             color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean read depth per locus") +
  theme_standard

# plot read depth vs missing data ----
p6 <- ggplot(loc_stats_F9, aes(x = MEAN_DEPTH_BMA.F9, y = MISS_BMA.F9)) +
  geom_point() +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH_BMA.F9, na.rm = TRUE)),
             color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
             color = "darkblue", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = mean(MISS_BMA.F9, na.rm = TRUE)),
             color = "red", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = 0.1),
             color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean depth per locus", y = "% missing data") +
  theme_standard

m3 <- multiplot(p1, p2, p3, p4, p5, p6, cols=2)

Data set contains 23040 SNP sites and 402 individuals.

Filter 11: Excess heterozygosity

Identify SNPs with more than 0.5 heterozygosity and significant.

Distribution of observed heterozygosity.

Identify loci with heterozygosity > 0.5, then correct p-values for multiple comparisons using Benjamini-Hochberg method.

Remove SNPs with excess heterozygosity and contigs with more than one SNP w/excess heterozygosity.

Visualize stats:

Data set contains 16831 SNP sites and 402 individuals.

Filter 12: Filter LQ individuals

Compare mean depth and number of sites called per individual.

Use genotype depth file to identify individuals with high variance in read depth across loci.

Compare distribution of depth.

Compare distribution of depth of individuals grouped by library.

Compare distribution of genotype depths (across all individuals).

Identify variance in depth w/in individuals.

Compare depth, missing data and individual heterozygosity levels.


imiss <- read_table2("data/VCF/BMA.F11.imiss") %>%
  select(INDV, N_DATA, F_MISS)

istats <- left_join(idepth, imiss)

Fis <- read_table2("data/VCF/BMA.F11.het") %>%
  select(-N_SITES)

istats <- left_join(istats, Fis)

p1 <- ggplot(istats, aes(x = MEAN, y = F_MISS, fill = `F`)) +
  geom_point(shape = 21, size = 2, color = "black") +
  scale_fill_viridis(direction = 1, option = "viridis",
                     name = 'Fis', discrete = FALSE) +
  theme_standard

p2 <- ggplot(istats, aes(x = COEFF_VAR, y = MEAN, fill = F_MISS)) +
 geom_point(shape = 21, size = 2, color = "black") +
  scale_fill_viridis(direction = -1, option = "viridis",
                     name = '% missing', discrete = FALSE) +
  theme_standard

p3 <- ggplot(istats, aes(x = `O(HOM)`, y = MEAN, fill = F_MISS)) +
  geom_point(shape = 21, size = 2, color = "black") +
  scale_fill_viridis(direction = -1, option = "viridis",
                     name = 'mean read depth', discrete = FALSE) +
  theme_standard

p4 <- ggplot(istats, aes(x = MEAN)) +
  geom_histogram(binwidth = 10, color = "black", fill = "darkorange") +
  theme_standard

p5 <- ggplot(istats, aes(x = `F`, y = MEAN, fill = F_MISS)) +
  geom_point(shape = 21, size = 2, color = "black") +
  scale_fill_viridis(direction = -1, option = "viridis",
                     name = '% missing', discrete = FALSE) +
  theme_standard

p6 <- ggplot(istats, aes(x = `F`, y = COEFF_VAR, fill = F_MISS)) +
  geom_point(shape = 21, size = 2, color = "black") +
  scale_fill_viridis(direction = -1, option = "viridis",
                     name = '% missing', discrete = FALSE) +
  theme_standard

p7 <- ggplot(istats, aes(x = `F`, y = RATIO_MEAN_MEDIAN, fill = F_MISS)) +
  geom_point(shape = 21, size = 2, color = "black") +
  scale_fill_viridis(direction = -1, option = "viridis",
                     name = '% missing', discrete = FALSE) +
  scale_y_continuous(limits = c(0.8, 4)) +
  theme_standard

p8 <- ggplot(istats, aes(x = F_MISS)) +
  geom_histogram(binwidth = 0.01, color = "black", fill = "darkorange") +
  theme_standard


m14b <- multiplot(p1, p2, p3, p4, p5, p6, p7, p8, cols=2)

Flag low quality individuals

Filter 13: Missing data by sample location


vcftools --vcf data/VCF/temp/BMA.F11.recode.vcf --out data/VCF/temp/BMA.F12 --remove data/VCF/F12_LQ.indv --recode --recode-INFO-all

# Campeche
vcftools --vcf data/VCF/temp/BMA.F12.recode.vcf --out data/VCF/temp/CAMP --keep data/VCF/CAMP.ind  --recode --recode-INFO-all
vcftools --vcf data/VCF/temp/CAMP.recode.vcf --out data/VCF/CAMP --missing-site

# Chandeleur Sound
vcftools --vcf data/VCF/temp/BMA.F12.recode.vcf --out data/VCF/temp/CS --keep data/VCF/CS.ind  --recode --recode-INFO-all
vcftools --vcf data/VCF/temp/CS.recode.vcf --out data/VCF/CS --missing-site

# Corpus Christi Bay
vcftools --vcf data/VCF/temp/BMA.F12.recode.vcf --out data/VCF/temp/CC --keep data/VCF/CC.ind  --recode --recode-INFO-all
vcftools --vcf data/VCF/temp/CC.recode.vcf --out data/VCF/CC --missing-site

# Florida Atlantic
vcftools --vcf data/VCF/temp/BMA.F12.recode.vcf --out data/VCF/temp/FLA --keep data/VCF/FLA.ind  --recode --recode-INFO-all
vcftools --vcf data/VCF/temp/FLA.recode.vcf --out data/VCF/FLA --missing-site

# Florida Gulf South
vcftools --vcf data/VCF/temp/BMA.F12.recode.vcf --out data/VCF/temp/FLGS --keep data/VCF/FLGS.ind  --recode --recode-INFO-all
vcftools --vcf data/VCF/temp/FLGS.recode.vcf --out data/VCF/FLGS --missing-site

# Florida Gulf North 
vcftools --vcf data/VCF/temp/BMA.F12.recode.vcf --out data/VCF/temp/FLGN --keep data/VCF/FLGN.ind  --recode --recode-INFO-all
vcftools --vcf data/VCF/temp/FLGN.recode.vcf --out data/VCF/FLGN --missing-site

# Louisiana
vcftools --vcf data/VCF/temp/BMA.F12.recode.vcf --out data/VCF/temp/LA --keep data/VCF/LA.ind  --recode --recode-INFO-all
vcftools --vcf data/VCF/temp/LA.recode.vcf --out data/VCF/LA --missing-site

# Mississippi Sound 
vcftools --vcf data/VCF/temp/BMA.F12.recode.vcf --out data/VCF/temp/MISS --keep data/VCF/MISS.ind  --recode --recode-INFO-all
vcftools --vcf data/VCF/temp/MISS.recode.vcf --out data/VCF/MISS --missing-site

# Mobile Bay
vcftools --vcf data/VCF/temp/BMA.F12.recode.vcf --out data/VCF/temp/MB --keep data/VCF/MB.ind  --recode --recode-INFO-all
vcftools --vcf data/VCF/temp/MB.recode.vcf --out data/VCF/MB --missing-site

Compare missing data per sample region:

identify loci with high missing data in each sample location (> 15 % missing data in a given location).

Remove loci from data set:

Compare stats:


# load stats files ----
ind_stats_F13 <- read.ind.stats(dir = "data/VCF", vcf = "BMA.F13") %>%
  separate(INDV, into = c("SP", "LIB", "SAMPLE_ID"), sep = "_", remove = FALSE, extra = "merge")

loc_stats_F13 <- read.loc.stats(dir = "data/VCF", vcf = "BMA.F13")

# plot missing data per indv ----
p1 <- ggplot(ind_stats_F13, aes(x = MISS_BMA.F13)) +
  geom_histogram(binwidth = .01, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MISS_BMA.F13, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 0.25),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "missing data per indv") +
  theme_standard

# plot read depth per indv ----
p2 <- ggplot(ind_stats_F13, aes(x = MEAN_DEPTH_BMA.F13)) +
  geom_histogram(binwidth = 10, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH_BMA.F13, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean read depth per indv") +
  theme_standard

# plot depth vs missing ----
p3 <- ggplot(ind_stats_F13, aes(x = MEAN_DEPTH_BMA.F13, y = MISS_BMA.F13)) +
  geom_point() +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH_BMA.F13, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = mean(MISS_BMA.F13, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = 0.25),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean depth per indv", y = "% missing data") +
  theme_standard

# plot Fis per indv ----
p4 <- ggplot(ind_stats_F13, aes(x = Fis_BMA.F13)) +
  geom_histogram(binwidth = .01, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(Fis_BMA.F13, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 0),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "Fis per indv") +
  theme_standard

# plot Fis vs missing data per indv ----
p5 <- ggplot(ind_stats_F13, aes(x = Fis_BMA.F13, y = MISS_BMA.F13)) +
  geom_point() +
  geom_vline(aes(xintercept = mean(Fis_BMA.F13, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 0),
                 color = "darkblue", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = mean(MISS_BMA.F13, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = 0.25),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "Fis per indv", y = "% missing data") +
  theme_standard

# plot Fis vs mean depth per indv ----
p6 <- ggplot(ind_stats_F13, aes(x = Fis_BMA.F13, y = MEAN_DEPTH_BMA.F13)) +
  geom_point() +
  geom_vline(aes(xintercept = mean(Fis_BMA.F13, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 0),
                 color = "darkblue", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = mean(MEAN_DEPTH_BMA.F13, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "Fis per indv", y = "mean depth per indv") +
  theme_standard


# plot distribution missing data per locus ----
p7 <- ggplot(loc_stats_F13, aes(x = MISS_BMA.F13)) +
  geom_histogram(binwidth = 0.01, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MISS_BMA.F13, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 0.1),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "% missing data per locus") +
  theme_standard

# plot distribution mean read depth ----
p8 <- ggplot(loc_stats_F13, aes(x = MEAN_DEPTH_BMA.F13)) +
  geom_histogram(binwidth = 5, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH_BMA.F13, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean read depth per locus") +
  theme_standard

# plot read depth vs missing data ----
p9 <- ggplot(loc_stats_F13, aes(x = MEAN_DEPTH_BMA.F13, y = MISS_BMA.F13)) +
  geom_point() +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH_BMA.F13, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = mean(MISS_BMA.F13, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = 0.1),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean depth per locus", y = "% missing data") +
  theme_standard

# plot depth vs SNP quality ----
site_qual <- read.table("data/VCF/BMA.F13.lqual", 
                        header = TRUE, stringsAsFactors = FALSE) %>%
  mutate(PROB = 10^(-QUAL/10))

temp <- data.frame(loc_stats_F13$MEAN_DEPTH_BMA.F13, site_qual$QUAL) %>%
  rename(depth = loc_stats_F13.MEAN_DEPTH_BMA.F13, qual = site_qual.QUAL)

p10 <- ggplot(temp, aes(x = depth, y = qual)) +
  geom_point(size = 1) +
  geom_vline(aes(xintercept = mean(depth, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = mean(qual, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean depth per locus", y = "SNP quality") +
  theme_standard

# plot number of SNPs per contig vs. mean depth ----
temp <- loc_stats_F13 %>%
  count(CHR)

p11 <- left_join(temp, loc_stats_F13) %>%
  ggplot() +
  geom_point(aes(x = n, y = MEAN_DEPTH_BMA.F13)) +
  labs(x = "number of SNPs per contig", y = "mean depth") +
  theme_standard

# plot no of SNPs per locus ----
p12 <- loc_stats_F13 %>%
  count(CHR) %>%
  ggplot(aes(x = n)) +
  geom_histogram(binwidth = 1, color = "black", fill = "darkorange") + 
  labs(x = "number of SNPs per locus") +
  theme_standard

m15 <- multiplot(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, cols=2)

Compare number of contigs vs number of SNPs.

[1] 14682
[1] 6048

Mean number of SNPs per contig is 64.1135371.

Final thresholds values for filtered data set BMA:

  • minimum sequence quality (minQ): 20
  • minimum genotype quality (minGQ): 20
  • minimum genotype call rate per locus (geno): 90%
  • minimum genotype depth (minD): 5
  • maximum missing data per individual (missInd): 25%
  • mean minimum depth per locus (m-minD): 15
  • mean maximum depth (m-minD): 300
  • dDocent filters run for allelic balance, mapping quality, strandedness and paired status, depth/quality ratio
  • Indels removed from data set
  • minimum genotype call rate by population: 85%

Data set contains 14682 SNP sites on 229 and 384 individuals.

Haplotyping

Prep for haplotyping

Create list of individuals retained in final vcf file:

Change so that new file is printed into Haplotyping/temp folder

Use BMA-BMA.individuals to create popmap as a tab-separated file of individuals and their population designation, with one individual per line (make sure UNIX format). This file is needed to write the genepop file, if not provided the script will run through the process but not write a genepop file and place into same folder rad_haplotyper.pl will be run from.

Place all necessary files in data/Haplotyping/temp directory (bam, fastq, reference.fasta, vcf-file)

Haplotype filtering

Overview of haplotyping success

Comparison of the number of loci that were filtered due to excess number of missing data or suspected paralogs during the haplotyping process to those that passed. Haplotyer was run without any threshold values (other than default values). The genepop file only contains loci that passed haplotyping stage.

Remove filtered loci from data set.

5957 loci passed haplotyping process.

Data set contains 384 individuals:

Identify threshold values to filter data set

Proportion of individuals haplotyped

Flag loci successfully haplotyped in < 90% of individuals.

136 loci were flagged as genotype call rate < 0.9.

Possible paralogs per locus

Loci are flagged as possible paralogs for an individuals when more than the expected number of haplotypes based on the SNP genotype call (homozygote, heterozygote) are detected.

1 % of individuals: 3.84 5 % of individuals: 19.2

Flag loci that are flagged as potential paralogs in 5 or more individuals. 4

176 loci were flagged as possible paralogs.

Number of SNPs & Haplotypes per locus

Each locus varies in the number of SNPs detected which determines the number of haplotypes expected in that population.

Filtering loci based on number of SNPs contained on that locus could bias the data set as loci with high recombination may be removed. On the other hand, assuming an approximate length of 250 bp loci with more than 25 SNPs would mean that 10% of bases are a polymorphisms.

Identify number of haplotypes loci with > 35 SNP sites.

0 loci were flagged.

Assuming that mutation is the only mechanism resulting in new haplotypes, the maximum expected number of haplotypes per locus is number of SNPs N + 1.


p1 <- ggplot(hap_stats, aes(x = Haplotypes)) +
  geom_histogram(binwidth = 1, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(Haplotypes, na.rm = TRUE)),
                 color = "darkred", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = quantile(Haplotypes, .99, na.rm = TRUE)),
                 color = "darkred", linetype = "dashed", size = 1) +
  labs(x = "# haplotypes per locus", y = "# loci") +
  theme_standard

temp <- hap_stats %>%
  select(Locus, Sites, Haplotypes, Prop_Haplotyped, Low_Cov.Geno_Err, Poss_Paralog) %>%
  mutate(exp_sites = Sites + 1) %>%
  mutate(xtra = Haplotypes - Sites)

p2 <- ggplot(temp, aes(x = Sites, y = Poss_Paralog)) +
  geom_point() +
  geom_hline(yintercept = 5, color = "darkblue", linetype = "dashed", size = 1) +
  geom_vline(xintercept = 25, color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "# SNP sites per locus", y = "possible paralogs") +
  theme_standard

p3 <- ggplot(temp, aes(x = Sites, y = xtra)) +
  geom_point() +
  geom_hline(yintercept = 10, color = "darkblue", linetype = "dashed", size = 1) +
  geom_vline(xintercept = 25, color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "# SNP sites per locus", y = "# extra haplotypes") +
  theme_standard

p4 <- ggplot(temp, aes(x = xtra, y = Prop_Haplotyped)) +
  geom_point(size = 1) +
  geom_hline(yintercept = 0.9, color = "darkblue", linetype = "dashed", size = 1) +
  geom_vline(xintercept = 25, color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "# extra haplotypes", y = "proportion of indv haplotyped") +
  theme_standard

p5 <- ggplot(temp, aes(x = xtra, y = Low_Cov.Geno_Err)) +
  geom_point(size = 1) +
  geom_hline(yintercept = 5, color = "darkblue", linetype = "dashed", size = 1) +
  geom_vline(xintercept = 25, color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "# extra haplotypes", y = "potential genotyping error") +
  theme_standard

p6 <- ggplot(temp, aes(x = xtra, y = Poss_Paralog)) +
  geom_point(size = 1) +
  geom_hline(yintercept = 5, color = "darkblue", linetype = "dashed", size = 1) +
  geom_vline(xintercept = 25, color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "# extra haplotypes", y = "possible paralogs") +
  theme_standard

m21 <- multiplot(p1, p2, p3, p4, p5, p6, cols = 2)

Again, removing loci with unexpectedly high numbers of haplotypes may bias the data set, as loci with e.g. high recombination may be removed from the data set.

0 loci were flagged for excessive number of haplotypes compared to the number of SNPs at that locus.

Low Coverage/Genotyping Error

After combining SNPs on the same contig during the haplotyping process, it is possible to observe fewer than the expected number of haplotypes based on the genotype call (heterozygote/homozygote). When this occurs, that genotype is coded as missing. For each locus the number of individuals for which is it flagged as a potential a potential genotyping error or suffering from null alleles due to low coverage detected for a given locus is recorded.

Loci with pronounced patterns of genotyping error likely due to low coverage will have low success rate for genotyping, i.e. they will be caught in missing data filters. Coverage issues are likely genotype specfic; previous filters have targeted loci and individuals with high variance in coverage and suspect genotpes have been coded as missing, i.e. this filter need not be very conservative, regardless, loci that are repeatedly being flagged as problematic should be removed.

183 loci were flagged as potentially affected by genotyping error.

Flag problematic individuals

Loci that are not successfully haplotyped in an individual due to missing data, complex locus, haplotyped genotype is higher/lower than SNP haplotype in a given individual are coded as missing for that individual. Problematic individual can be identified as having a high proportion of missing data.

Individuals with low proportion of successfully haplotyped loci

Problem individuals can be identified by choosing a cut-off value for the minium proportion of sucessfully haplotyped loci and excluding individuals below that threshold value. Removing loci with low haplotyping success with decrease the amount of missing data. Therefore, final missing data cut-offs should be applied after removing those loci from the data set, though it is important to flag potentially problematic individuals based on their haplotyping stats at this point.

Possible paralogs per individual

Individuals with high proportion of loci flagged as possible paralogs should be flagged as potential problem individuals.

Cut-off for individuals with loci flagged paralogs in > 1% of loci is 59.57.

The highest number of flagged loci in an individuals is 81, which is equivalent to 1.36% of loci.

Individuals with high proportion of potential allele dropout/genotyping error

Remove individuals with high proportion of loci that have been flagged as potential allele dropouts/genotyping error.

The highest number of flagged loci in an individuals is 110, which is equivalent to 1.85% of loci.

Filter flagged loci and individuals as necessary

Load genepop file with haplotyped loci.


 Converting data from a Genepop .gen file to a genind object... 


File description:  BMA.gen 

...done.
/// GENIND OBJECT /////////

 // 384 individuals; 5,957 loci; 20,898 alleles; size: 35.6 Mb

 // Basic content
   @tab:  384 x 20898 matrix of allele counts
   @loc.n.all: number of alleles per locus (range: 1-25)
   @loc.fac: locus factor for the 20898 columns of @tab
   @all.names: list of allele names for each locus
   @ploidy: ploidy of each individual  (range: 2-2)
   @type:  codom
   @call: read.genepop(file = "data/Haplotyping/BMA.gen", ncode = 3L, quiet = FALSE)

 // Optional content
   @pop: population of each individual (group size range: 384-384)

Remove flagged loci and individuals.

/// GENIND OBJECT /////////

 // 380 individuals; 5,621 loci; 18,801 alleles; size: 31.8 Mb

 // Basic content
   @tab:  380 x 18801 matrix of allele counts
   @loc.n.all: number of alleles per locus (range: 1-25)
   @loc.fac: locus factor for the 18801 columns of @tab
   @all.names: list of allele names for each locus
   @ploidy: ploidy of each individual  (range: 2-2)
   @type:  codom
   @call: .local(x = x, i = i, j = j, drop = drop)

 // Optional content
   @pop: population of each individual (group size range: 380-380)

Compare duplicate individuals


# extract genotype matrix
df <- genind2df(gen,
                usepop = TRUE,
                sep = ":", oneColPerAll = FALSE) %>%
  select(-pop) %>%
  rownames_to_column()

# create list of duplicates
pairs <- list()

# input each set of duplicates as a vector of a list
pairs[[1]] <- c("BMA_3B-E12_MS-37", "BMA_2A-C01_MS-57")
pairs[[2]] <- c("BMA_1A-C01_CCGN-001", "BMA_1A-D02_CCGN01")
pairs[[3]] <- c("BMA_1A-H04_CC02-35",   "BMA_2B-B09_CC-35")
pairs[[4]] <- c("BMA_3B-H02_MS26", "BMA_3B-D01_MS26")
pairs[[5]] <- c("BMA_2B-H02_MB06", "BMA_1B-E06_MB-06")
pairs[[6]] <- c("BMA_1B-G11_CC-08", "BMA_1A-H03_CC02-08")
pairs[[7]] <- c("BMA_3B-H03_BM027", "BMA_3B-A07_BM027")
pairs[[8]] <- c("BMA_1B-C04_MS-21", "BMA_2A-C05_MS-61")
pairs[[9]] <- c("BMA_2B-H01_CS3", "BMA_1A-B03_CS-03")
pairs[[10]] <- c("BMA_3B-E11_MS-36", "BMA_1B-C12_MS-56")
pairs[[11]] <- c("BMA_2A-C07_MS-63", "BMA_1B-C06_MS-23")
pairs[[12]] <- c("BMA_2B-B01_CC-12", "BMA_3A-A02_CC-12")
pairs[[13]] <- c("BMA_2B-G05_BMAR-03", "BMA_3B-E12_BMAR-02")


# create empty list for genotype error (discordant loci)
genoerror <- list()

# create empty vector for duplicates (retains first indv in pair)
dup <- character()

# identify discordant genotypes for each set of duplicates
for (p in pairs){

# select duplicates from genotype matrix
geno <- df %>%
  filter(rowname %in% p) %>%
  select(-rowname)

# compare genotypes
comp <- (t(geno))

contigs <- as.data.frame(comp) %>%
  rownames_to_column(var = "LOCUS") %>%
  mutate(V1 = as.character(V1),
         V2 = as.character(V2)) %>%
  filter(V1 != V2)

# write vector with first individual in pair
ind <- p[1]

dup <- c(dup, ind)

genoerror[[ind]] <- contigs

}

# if it throws error object V1 or V2 not found it means one or more of the samples names are not correct or no longer in data set after filtering

# combine into one dataframe
genoerror <- ldply(genoerror, data.frame) %>%
  rename(DUP1 = `.id`,
         GENO_INDV1 = V1,
         GENO_INDV2 = V2)

write_delim(genoerror, "results/haplotyped.genoerror", delim = "\t")

Compare genotype depth for discordant loci:

Determine number of times discordant genotype is due to heterozygote/homozygote.

Compare number of discordant genotype calls per duplicate set.

Identify loci consistently affected by genotyping error

Remove flagged loci and one individual per pair.

/// GENIND OBJECT /////////

 // 367 individuals; 5,621 loci; 18,801 alleles; size: 30.9 Mb

 // Basic content
   @tab:  367 x 18801 matrix of allele counts
   @loc.n.all: number of alleles per locus (range: 1-25)
   @loc.fac: locus factor for the 18801 columns of @tab
   @all.names: list of allele names for each locus
   @ploidy: ploidy of each individual  (range: 2-2)
   @type:  codom
   @call: .local(x = x, i = i, j = j, loc = ..1, drop = drop)

 // Optional content
   @pop: population of each individual (group size range: 367-367)

12708 loci were removed as potentially affected by genotyping error.

Relatedness & Inbreeding

Create input file fore related package.

Calculate pairwise relatedness according to Lynch & Ritland 1999; uses inbreeding estimates for each individuals to estimate relatedness.

Distribution of levels of pairwise relatedness:

Distribution of levels of homozygosity:

Heterozygosity and HWE

Generate summary statistics

Compare observed (Ho) and expected (He) heterozygosity for all individuals across all populations.

Identify loci that are now monomorphic and remove from data set:

/// GENIND OBJECT /////////

 // 367 individuals; 5,590 loci; 18,768 alleles; size: 30.9 Mb

 // Basic content
   @tab:  367 x 18768 matrix of allele counts
   @loc.n.all: number of alleles per locus (range: 2-25)
   @loc.fac: locus factor for the 18768 columns of @tab
   @all.names: list of allele names for each locus
   @ploidy: ploidy of each individual  (range: 2-2)
   @type:  codom
   @call: .local(x = x, i = i, j = j, loc = ..1, drop = drop)

 // Optional content
   @pop: population of each individual (group size range: 20-52)
   @strata: a data frame with 5 columns ( LIB_ID, SP, WELL, SAMPLE_ID, POP )

Expected vs observed heterozygosity across all populations.

Compare distribution of observed heterozygosity (Ho) in each putative population:

Compare distribution of expected heterozygosity He, (within cluster diversity) in each putative population:

Comparison of oberved vs. heterozygosity within each population:

Test for deviations from Hardy-Weinberg equilibrium within each group (POP level):

Format output and view results:

Distribution of number of times loci are out of HWE

Remove loci out of HWE in 5 or more populations:

/// GENIND OBJECT /////////

 // 367 individuals; 5,554 loci; 18,611 alleles; size: 30.7 Mb

 // Basic content
   @tab:  367 x 18611 matrix of allele counts
   @loc.n.all: number of alleles per locus (range: 2-25)
   @loc.fac: locus factor for the 18611 columns of @tab
   @all.names: list of allele names for each locus
   @ploidy: ploidy of each individual  (range: 2-2)
   @type:  codom
   @call: .local(x = x, i = i, j = j, loc = ..1, drop = drop)

 // Optional content
   @pop: population of each individual (group size range: 20-52)
   @strata: a data frame with 7 columns ( LIB_ID, SP, WELL, SAMPLE_ID, POP, REGION, ... )

Final data set

Write files with filtered data set:

Write files with individuals and Contigs still contained in data set and use to filter vcf file.

Expected 4 pieces. Missing pieces filled with `NA` in 329 rows [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, ...].Expected 2 pieces. Missing pieces filled with `NA` in 38 rows [10, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 163, 164, 165, 166, 167, 168, 169, 170, 171, ...].
Error in filter(ind_stats_F14, INDV %in% keepind$LIB_ID) : 
  object 'ind_stats_F14' not found

Write vcf file and 012 format:

Compare stats for final filtered data set:


# load stats files ----
ind_stats_fil <- read.ind.stats(dir = "data/VCF", vcf = "BMA.fil") %>%
  separate(INDV, into = c("SP", "LIB", "SAMPLE_ID"), sep = "_", remove = FALSE)

loc_stats_fil <- read.loc.stats(dir = "data/VCF/", vcf = "BMA.fil")

View(ind_stats_fil)
View(hap_ind_stats)

# plot missing data per indv ----
p1 <- ggplot(ind_stats_fil, aes(x = MISS_BMA.fil)) +
  geom_histogram(binwidth = .01, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MISS_BMA.fil, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 0.25),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "missing data per indv") +
  theme_standard

# plot Fis per indv ----
p2 <- ggplot(ind_stats_fil, aes(x = Fis_BMA.fil)) +
  geom_histogram(binwidth = .01, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(Fis_BMA.fil, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 0),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "Fis per indv") +
  theme_standard

# plot read depth per indv ----
p3 <- ggplot(ind_stats_fil, aes(x = MEAN_DEPTH_BMA.fil)) +
  geom_histogram(binwidth = 10, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH_BMA.fil, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean read depth per indv") +
  theme_standard

# plot depth vs missing ----
p4 <- ggplot(ind_stats_fil, aes(x = MEAN_DEPTH_BMA.fil, y = MISS_BMA.fil)) +
  geom_point() +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH_BMA.fil, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = mean(MISS_BMA.fil, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = 0.25),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean depth per indv", y = "% missing data") +
  theme_standard

# plot Fis vs missing data per indv ----
p5 <- ggplot(ind_stats_fil, aes(x = Fis_BMA.fil, y = MISS_BMA.fil)) +
  geom_point() +
  geom_vline(aes(xintercept = mean(Fis_BMA.fil, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 0),
                 color = "darkblue", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = mean(MISS_BMA.fil, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = 0.25),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "Fis per indv", y = "% missing data") +
  theme_standard

# plot Fis vs mean depth per indv ----
p6 <- ggplot(ind_stats_fil, aes(x = Fis_BMA.fil, y = MEAN_DEPTH_BMA.fil)) +
  geom_point() +
  geom_vline(aes(xintercept = mean(Fis_BMA.fil, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 0),
                 color = "darkblue", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = mean(MEAN_DEPTH_BMA.fil, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "Fis per indv", y = "mean depth per indv") +
  theme_standard

# plot distribution missing data per locus ----
p7 <- ggplot(ind_stats_fil, aes(x = MISS_BMA.fil)) +
  geom_histogram(binwidth = 0.01, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MISS_BMA.fil, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 0.1),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "% missing data per locus") +
  theme_standard

# plot distribution mean read depth ----
p8 <- ggplot(loc_stats_fil, aes(x = MEAN_DEPTH_BMA.fil)) +
  geom_histogram(binwidth = 5, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH_BMA.fil, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean read depth per locus") +
  theme_standard

# plot read depth vs missing data ----
p9 <- ggplot(loc_stats_fil, aes(x = MEAN_DEPTH_BMA.fil, y = MISS_BMA.fil)) +
  geom_point() +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH_BMA.fil, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = mean(MISS_BMA.fil, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = 0.1),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean depth per locus", y = "% missing data") +
  theme_standard

# plot no of SNPs per locus ----
p10 <- loc_stats_fil %>%
  count(CHR) %>%
  ggplot(aes(x = n)) +
  geom_histogram(binwidth = 1, color = "black", fill = "darkorange") + 
  labs(x = "number of SNPs per locus") +
  theme_standard

temp <- loc_stats_fil %>%
  count(CHR)

# plot number of SNPs per contig vs. mean depth ----
p11 <- left_join(temp, loc_stats_fil) %>%
  ggplot() +
  geom_point(aes(x = n, y = MEAN_DEPTH_BMA.fil)) +
  labs(x = "number of SNPs per contig", y = "mean depth") +
  theme_standard

# plot depth vs SNP quality ----
site_qual <- read.table("data/VCF/BMA.fil.lqual", 
                        header = TRUE, stringsAsFactors = FALSE) %>%
  mutate(PROB = 10^(-QUAL/10))

temp <- data.frame(loc_stats_fil$MEAN_DEPTH_BMA.fil, site_qual$QUAL) %>%
  rename(depth = loc_stats_fil.MEAN_DEPTH_BMA.fil, qual = site_qual.QUAL)

p12 <- ggplot(temp, aes(x = depth, y = qual)) +
  geom_point(size = 1) +
  geom_vline(aes(xintercept = mean(depth, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = mean(qual, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean depth per locus", y = "SNP quality") +
  theme_standard

mfil <- multiplot(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, cols=2)

---
title: "Genotyping Gafftopsail catfish data set"
output:
  html_notebook:
    code_folding: hide
    df_print: paged
    highlight: kate
    theme: flatly
    toc: yes
---

```{r load libraries, message=FALSE, warning=FALSE}

source("scr/libraries.R")
source("scr/ggplot.R")
source("scr/VCFfilterstats.R")
source("scr/HaplotypR.R")
source("scr/xtrafunctions.R")
source("scr/genind.R")


pops <- c("FLA", 
          "FLGS", "FLGN", 
          "MB", "MISS", "CS", "LA", 
          "CC", 
          "CAMP")

col_pops <- c("chartreuse4", 
              "dodgerblue4", "steelblue1", 
              "firebrick", "orangered3", "darkorange", "gold",
              "darkslategrey1",
              "purple3")

reg <- c("SWATL", "EGULF", "CGULF", "WGULF", "SGULF")

col_regs <- c("chartreuse4", "dodgerblue4", "orangered3", "darkslategray1", "purple3")

oce <- c("ATL", "GULF")

col_oce <- c("chartreuse4", "darkslategray1")

```

# Read mapping

## Map reads

Any renaming of files needs to happen before `fastq`-files are mapped. The population designation (before underscore) is used by `FreeBayes` to call SNPs so it is important that population designation make biological sense.

Values chosen for reduced representation reference:

* **c** = 0.8
* **K1** = 5
* **K2** = 2

Transfer copy of `reference.fasta` and associated files Reference constructed using *c = 0.8*, *K1 = 5*, and *K2 = 2*. into each directory containing demultiplexed and quality trimmed data/SEQ

Run `dDocent` from within each Library directory to map reads to `reference.fasta`.

```{bash}

# go to reference parent directory
cd /home/soleary/GAFFTOPS/BMA_POPGEN/data/SEQ/

# loop over all reference creation folders
for d in BMA*

do

    (cp /home/soleary/GAFFTOPS/BMA_POPGEN/data/REF/REF5_2/reference.* $d/)
    
done

```

Write configuration file to run dDocent in each reference directory.

```{r}

# parameters for shell script ----

# correct number of individuals
ind <- "yes"

# number of processors
processors <- "25"

# quality trim needed
trim <- "no"

# perform reference assembly
assembly <- "no"

# read mapping required
map <- "yes"

# custom values for mapping
map_param <- "yes"

# match score
A <- "1"

# mismatch score
B <- "3"

# gap penalty
O <- "5"

# SNP calling required
snp <- "no"

# email address
email <- "shannon.j.oleary@gmail.com"

# define paths/directories ----

# path to parent directory for references
path <- "/home/soleary/GAFFTOPS/BMA_POPGEN/data/SEQ"

# sequence folders
dir <- c("BMA-1", "BMA-2", "BMA-3")

# write shell scripts in each directory ----
for(d in dir){
  
  # write shell script
  p <- file.path(path, d, "dDocent_map.sh")
  
  con <- file(p, open = "wt")
  
  writeLines(glue("dDocent <<Over! 
                  {ind} 
                  {processors} 
                  {trim} 
                  {assembly} 
                  {map}  
                  {map_param} 
                  {A} 
                  {B} 
                  {O} 
                  {snp} 
                  {email} 
                  Over!"), con)
  
  close(con)
  
}

```

Execute `dDocent` from within each reference folder to initiate a run to map individuals to reduced representation reference and call SNPs.

```{bash}

# go to sequence parent directory
cd /home/soleary/GAFFTOPS/BMA_POPGEN/data/SEQ/

# loop over all reference creation folders
for d in BMA-*

do

    (cd "$d" && chmod 755 dDocent_map.sh && ./dDocent_map.sh)
    
done

```

## QA/QC read mapping

### Query mapping statistics

During the mapping stage, `dDocent` calls `BWA` to map reads from the individuals in the folder to the generated reduced representation reference and create a `-RG.bam`-file for each individual. The second column of a BAM (or SAM) file contains FLAGs with binary encoded information on mapping quality, pairedness etc. that can be used to compare the mapping efficiency to the reduced representation reference.

The number of reads mapped can be counted using `samtools idxstats <aln-RG.bam>` which retrieves stats from the bam-file. The output is TAB-delimited file with each line consisting of the reference sequence name, sequence length, # mapped reads and # unmapped (empty) reads. In addition, `samtools` can also be be used to query `samtools flagstat file.bam` which returns an output containing the number of reads for which each flag is true.

dDocent creates a file called `bamlist.list` containing all the bam files that were generated during read mapping in `dDocent` using `BWA`. Write script to gather flagstats from all `bam`-files.

```{r flagstats script}

# Sequence folders
SEQ <- c("BMA-1", "BMA-2", "BMA-3")

# write script to gather flagstats
l <- list()

for (s in SEQ){
  
  l[[s]] <- read_table2(paste("data/SEQ/", s, "/bamlist.list", sep =""), col_names = "BAM") %>%
    mutate(PATH = paste("data/SEQ/", s, "/", sep = ""),
           COMMAND = "samtools flagstat",
           OUT = paste(">> data/SEQ/", s, ".flagstats", sep = "")) %>%
    select( COMMAND, PATH, BAM, OUT) %>%
    unite(FILE, 2:3, sep = "")

}

bam <- ldply(l, data.frame) %>%
  select(-`.id`)

write_delim(bam, "scr/flagstats.sh", delim = "\t", col_names = FALSE)

```

Run flagstats.

```{bash run flagstats, eval=FALSE, include=FALSE}

chmod 755 scr/flagstats.sh
./scr/flagstats.sh

```

Write script to gather idxstats from all `bam`-files.

```{r script idxstats}

# write script to gather idxstats
l <- list()

l[["BMA1"]] <- read_table2("data/SEQ/BMA-1/bamlist.list", col_names = "BAM") %>%
  mutate(PATH = "data/SEQ/BMA-1/",
         COMMAND = "samtools idxstats",
         OUT = ">> data/SEQ/BMA1.idxstats") %>%
  select( COMMAND, PATH, BAM, OUT) %>%
  unite(FILE, 2:3, sep = "")

l[["BMA2"]] <- read_table2("data/SEQ/BMA-2/bamlist.list", col_names = "BAM") %>%
  mutate(PATH = "data/SEQ/BMA-2/",
         COMMAND = "samtools idxstats",
         OUT = ">> data/SEQ/BMA2.idxstats") %>%
  select( COMMAND, PATH, BAM, OUT) %>%
  unite(FILE, 2:3, sep = "")

l[["BMA3"]] <- read_table2("data/SEQ/BMA-3/bamlist.list", col_names = "BAM") %>%
  mutate(PATH = "data/SEQ/BMA-3/",
         COMMAND = "samtools idxstats",
         OUT = ">> data/SEQ/BMA3.idxstats") %>%
  select( COMMAND, PATH, BAM, OUT) %>%
  unite(FILE, 2:3, sep = "")

bam <- ldply(l, data.frame) %>%
  select(-`.id`)

write_delim(bam, "scr/idxstats.sh", delim = "\t", col_names = FALSE)

```

Run indxstats.

```{bash run idxstats, eval=FALSE, include=FALSE}

chmod 755 scr/idxstats.sh
./scr/idxstats.sh

```

### Format stats files into tidy data sets

Appending the file results in the information per individual being printed in a new set of row being appended to the file, i.e. there will be as many rows for a given locus as individuals were mapped. The file can be re-formatted and summary statistics calculated using dplyr and tidyr.

Format idxstats:

```{r format idxstats}

# create vectors of files to be imported, reference codes, K1 and K2, dataframe names
filenames <- list.files(path = "data/SEQ", pattern = "*.idxstats")
names <- substr(filenames, 1, 8)
lib <- substr(filenames, 1, 4)

# import data
for (i in names){
  filepath <- file.path("data/SEQ", paste(i, 'stats', sep =""))
  assign(i, read.table(filepath, sep = "", header = FALSE,
                     col.names = c("Locus", "Length", "Reads_Mapped", 'blank')) %>%
           select(1:3))
  }

# # make sure to delete old list if rerunning the code
rm(dflist_idx)
rm(MapStats.idx)

# Create list of one dataframe per idxstats file and group by locus
dflist_idx <- lapply(ls(pattern = "*.idx"), get)

for (df in 1:length(dflist_idx)){
  x <- dflist_idx[[df]]
  x[['Locus']] <- as.character(x[['Locus']])
  x = x %>% group_by(Locus)
  dflist_idx[[df]] <- x
}

# Create new dataframes with summary stats per library and bind into final output/dataframe
MapStats.idx <- data.frame()

for (df in 1:length(dflist_idx)){
    
  x = summarize(dflist_idx[[df]],
                Length = mean(Length),
                Mean_Mapped = mean(Reads_Mapped),
                Sum_Mapped = sum(Reads_Mapped),
                Min_Mapped = min(Reads_Mapped),
                Max_Mapped = max(Reads_Mapped),
                SD_Mapped = sd(Reads_Mapped))
  x[x == 0] <- NA

  temp <- summarize(x, Mean_Mapped_Non0 = mean(Mean_Mapped, na.rm = TRUE)) %>%
    mutate(Lib = lib[df],
           Not_Mapped = nrow(filter(x, is.na(Sum_Mapped))),
           N_Loci_Ref = nrow(x)) %>%
    select(Lib, N_Loci_Ref, Not_Mapped, Mean_Mapped_Non0)

  MapStats.idx <- bind_rows(MapStats.idx, temp)
}

MapStats.idx <- MapStats.idx %>%
  mutate(PROP_EMPTY = round((Not_Mapped/N_Loci_Ref)*100, digits = 2),
         CONTIGS_MAPPED = N_Loci_Ref - Not_Mapped)

write.table(MapStats.idx, "results/MapStats.idx", quote = FALSE)

# remove large (duplicate) files
rm(BMA1.idx)
rm(BMA2.idx)
rm(BMA3.idx)

MapStats.idx

```

Format flagstats:

```{r format flagstats}

# Files to be imported
filenames <- list.files(path='data/SEQ', pattern = '*.flagstats')

# create vectors of files to be imported
names <- substr(filenames, 1, 9)
lib <- substr(filenames, 1, 4)

# import data
for (i in names){
  filepath <- file.path('data/SEQ', paste(i, 'stats', sep =""))
  assign(i, read.csv(filepath, sep = "+", header = FALSE,
                     col.names = c("N_Reads", "CAT"),
                     stringsAsFactors = FALSE) %>%
           select(1:2))
}

# Create list of one dataframe per flagstats file and create tidy data set
# should be 3 elements/libraries
rm(dflist_flag)

dflist_flag <- lapply(ls(pattern = "*flag"), get)

# Change N_Reads to numeric
for (df in 1:length(dflist_flag)){
  x <- dflist_flag[[df]]
  x[['N_Reads']] <- as.numeric(x[['N_Reads']])
  dflist_flag[[df]] <- x
}

for (df in 1:length(dflist_flag)){
  x <- dflist_flag[[df]]
  
  n <- nrow(x)/14

  x <- x %>%
    filter(grepl("0 mapped|properly paired|mapQ>=5", CAT)) %>%
    mutate(MAPSTAT = ifelse(grepl("mapQ>=5", CAT), "Mismatch",
                   ifelse(grepl("properly", CAT), "Prop_Paired", "Mapped"))) %>%
    mutate(Ind = c(rep(1:n, each = 3))) %>% 
    # not sure if extra individual in there somehow
    select(4, 3, 1) %>%
    spread(MAPSTAT, N_Reads)

  dflist_flag[[df]] <- x
}

# Create new dataframes with summary stats and add to main final data frame
MapStats.flag <- data.frame()

for (df in 1:length(dflist_flag)){
  x = summarize(dflist_flag[[df]], Sum_Mapped = sum(Mapped),
                             Reads_Mapped = mean(Mapped),
                             Sum_Paired = sum(Prop_Paired),
                             Mean_Paired = mean(Prop_Paired),
                             Sum_Mismatch = sum(Mismatch),
                             Mean_Mismatch = mean(Mismatch)) %>%
  mutate(Lib = lib[df]) %>%
  select(7, 1:6)
  MapStats.flag <- bind_rows(MapStats.flag, x)
}

# write to file
write.table(MapStats.flag, "results/MapStats.flag", quote = FALSE)

MapStats.flag

# combine files
mapstats <- left_join(MapStats.idx, MapStats.flag) %>%
  mutate(PROP_MISMATCH = Sum_Mismatch/Sum_Mapped)

# write summary stats file
write.table(mapstats, file = "results/BWA_mapping.stats", quote = FALSE, sep = " ")

```

### Evaluate & compare mapping results

Compare number of reference contigs for which no reads mapped to per library.

```{r plot mapstats, fig.height=4, fig.width=12, message=FALSE, warning=FALSE}

mapstats <- read_delim("results/BWA_mapping.stats", delim = " ")

# plot no of loci vs "empty" loci
p1 <- ggplot(mapstats, aes(x = Lib, y = CONTIGS_MAPPED)) +
  geom_bar(stat = "identity", color = "black", fill = "darkorange") +
  scale_y_continuous(limits = c(0, 0.05)) +
  labs(x = "library", y = "% contigs w/no reads mapped") +
  theme_standard

p2 <- ggplot(mapstats, aes(x = Lib, y = Reads_Mapped)) +
  geom_bar(stat = "identity", color = "black", fill = "darkorange") +
  labs(x = "library", y = "mean reads mapped per indv") +
  theme_standard

p3 <- ggplot(mapstats, aes(x = Lib, y = PROP_MISMATCH)) +
  geom_bar(stat = "identity", color = "black", fill = "darkorange") +
  labs(x = "",y = "% reads not mapped as pair") +
  theme_standard

multiplot(p1, p2, p3, cols = 3)

```

# SNP calling

Transfer copy of `reference.fasta` and associated files into SNP calling directory.

Create symlinks from all `fq`, `bam` and `bam.bai`-files for each separately mapped library in `SNP_Calling` folder.

```{bash softlink files, eval=FALSE, include=FALSE}

ln -s /home/soleary/GAFFTOPS/data/SEQ/BMA-1/*.fq.gz /home/soleary/GAFFTOPS/data/SNP_Calling
ln -s /home/soleary/GAFFTOPS/data/SEQ/BMA-1/*.bam* /home/soleary/GAFFTOPS/data/SNP_Calling

ln -s /home/soleary/GAFFTOPS/data/SEQ/BMA-2/*.fq.gz /home/soleary/GAFFTOPS/data/SNP_Calling
ln -s /home/soleary/GAFFTOPS/data/SEQ/BMA-2/*.bam* /home/soleary/GAFFTOPS/data/SNP_Calling

ln -s /home/soleary/GAFFTOPS/data/SEQ/BMA-3/*.fq.gz /home/soleary/GAFFTOPS/data/SNP_Calling
ln -s /home/soleary/GAFFTOPS/data/SEQ/BMA-3/*.bam* /home/soleary/GAFFTOPS/data/SNP_Calling

# remove unnecessary files
cd /home/soleary/GAFFTOPS/data/SNP_Calling
rm cat-RRG.bam*

```

Execute `dDocent` from within `SNP_Calling`-folder to call variants across all individuals (all libraries).

```{bash call SNPs, eval=FALSE, include=FALSE}

cd /home/soleary/GAFFTOPS/data/SNP_Calling
dDocent

```

File `TotalrawSNPs.vcf` contains all raw SNP/INDEL calls. Do not need to keep links of `fq.gz`-, `bam`-, `.bam.bai`-files after SNPs have been called. Copy `TotalRaSNPs.vcf` to `VCF` for SNP filtering.

```{bash copy TotalRawSNPs for filtering, eval=FALSE, include=FALSE}

cd /home/soleary/GAFFTOPS/data/SNP_Calling
rm *fq.gz *bam*

cp /home/soleary/GAFFTOPS/data/SNP_Calling/TotalRawSNPs.vcf /home/soleary/GAFFTOPS/data/VCF/temp/

```

# SNP filtering

`dDocent` uses `FreeBayes` to call SNPs and write a VCF-file `TotalrawSNPs.vcf`. This data set was filtered to remove low quality and artefactual SNP sites, paralogs and low quality individuals based on levels of missing data, minimum/maximum read depth, genotype call rate, and minor allele frequencies. Contigs may contain more than one SNP; the script `rad_haplotyper.pl` was used to create haplotypes for each locus.

## Raw data 

### Individuals/Populations sampled

Generate a list of all individuals included in the SNP data set called by FreeBayes in the dDocent pipeline.

```{bash write raw indv}

vcfsamplenames data/VCF/temp/TotalRawSNPs.vcf > data/VCF/raw.ind

```

Use `Ind_raw` file to write text files of individuals in each library and in regional groupings. 

```{r create Ind files, message=FALSE, warning=FALSE}

# import individuals in raw data set ----
Ind_RAW <- read_delim("data/VCF/raw.ind", delim = "\t", col_names = "LIB_ID") %>%
  separate(LIB_ID, into = c("SP", "WELL", "SAMPLE_ID"), sep = "_", remove = FALSE, extra = "merge")

# create files with individuals by population ----

SampleInfo <- read_delim("data/POPGEN/SampleInfo.txt", delim = "\t")

Ind_RAW <- left_join(Ind_RAW, SampleInfo)

pops <- unique(Ind_RAW$POP)

for (p in pops){
  
  df <- Ind_RAW %>%
    filter(POP == p) %>%
    select(LIB_ID)
  
  write_delim(df, paste("data/VCF/", p, ".ind", sep = ""),
            delim = "", col_names = FALSE)
  
}


# create files with individuals per library ----
temp <- Ind_RAW %>%
  filter(grepl("1A", WELL) |
         grepl("1B", WELL)) %>%
  select(LIB_ID)

write.table(temp, "data/VCF/BMA1.ind",
            col.names = FALSE, row.names = FALSE, quote = FALSE)

temp <- Ind_RAW %>%
  filter(grepl("2A", WELL) |
         grepl("2B", WELL)) %>%
  select(LIB_ID)

write.table(temp, "data/VCF/BMA2.ind",
            col.names = FALSE, row.names = FALSE, quote = FALSE)

temp <- Ind_RAW %>%
  filter(grepl("3A", WELL) |
         grepl("3B", WELL)) %>%
  select(LIB_ID)

write.table(temp, "data/VCF/BMA3.ind",
            col.names = FALSE, row.names = FALSE, quote = FALSE)

```

### Compare Individual & SNP stats

Use `VCFtools` to create stats files for depth, missing data, heterozygosity and site quality for the raw data.

```{bash query raw stats}

vcftools --vcf data/VCF/temp/TotalRawSNPs.vcf --out data/VCF/BMA_raw --depth
vcftools --vcf data/VCF/temp/TotalRawSNPs.vcf --out data/VCF/BMA_raw --site-mean-depth
vcftools --vcf data/VCF/temp/TotalRawSNPs.vcf --out data/VCF/BMA_raw --site-quality
vcftools --vcf data/VCF/temp/TotalRawSNPs.vcf --out data/VCF/BMA_raw --missing-indv
vcftools --vcf data/VCF/temp/TotalRawSNPs.vcf --out data/VCF/BMA_raw --missing-site
vcftools --vcf data/VCF/temp/TotalRawSNPs.vcf --out data/VCF/BMA_raw --het
vcftools --vcf data/VCF/temp/TotalRawSNPs.vcf --out data/VCF/BMA_raw --singletons

```

Compare raw stats.
    
```{r stats raw, fig.height=20, fig.width=10, message=FALSE, warning=FALSE}

# load stats files ----
ind_stats_raw <- read.ind.stats(dir = "data/VCF", vcf = "BMA_raw") %>%
  separate(INDV, into = c("SP", "LIB", "SAMPLE_ID"), sep = "_", remove = FALSE)

loc_stats_raw <- read.loc.stats(dir = "data/VCF/", vcf = "BMA_raw")

# plot missing data per indv ----
p1 <- ggplot(ind_stats_raw, aes(x = MISS_BMA_raw)) +
  geom_histogram(binwidth = .01, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MISS_BMA_raw, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 0.5),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "missing data per indv") +
  theme_standard

# plot read depth per indv ----
p2 <- ggplot(ind_stats_raw, aes(x = MEAN_DEPTH_BMA_raw)) +
  geom_histogram(binwidth = 10, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH_BMA_raw, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean read depth per indv") +
  theme_standard

# plot depth vs missing ----
p3 <- ggplot(ind_stats_raw, aes(x = MEAN_DEPTH_BMA_raw, y = MISS_BMA_raw)) +
  geom_point() +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH_BMA_raw, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = mean(MISS_BMA_raw, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = 0.5),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean depth per indv", y = "% missing data") +
  theme_standard

# plot Fis per indv ----
p4 <- ggplot(ind_stats_raw, aes(x = Fis_BMA_raw)) +
  geom_histogram(binwidth = .01, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(Fis_BMA_raw, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 0),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "Fis per indv") +
  theme_standard

# plot Fis vs missing data per indv ----
p5 <- ggplot(ind_stats_raw, aes(x = Fis_BMA_raw, y = MISS_BMA_raw)) +
  geom_point() +
  geom_vline(aes(xintercept = mean(Fis_BMA_raw, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 0),
                 color = "darkblue", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = mean(MISS_BMA_raw, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = 0.5),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "Fis per indv", y = "% missing data") +
  theme_standard

# plot Fis vs mean depth per indv ----
p6 <- ggplot(ind_stats_raw, aes(x = Fis_BMA_raw, y = MEAN_DEPTH_BMA_raw)) +
  geom_point() +
  geom_vline(aes(xintercept = mean(Fis_BMA_raw, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 0),
                 color = "darkblue", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = mean(MEAN_DEPTH_BMA_raw, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "Fis per indv", y = "mean depth per indv") +
  theme_standard


# plot distribution missing data per locus ----
p7 <- ggplot(loc_stats_raw, aes(x = MISS_BMA_raw)) +
  geom_histogram(binwidth = 0.01, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MISS_BMA_raw, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 0.1),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "% missing data per locus") +
  theme_standard

# plot distribution mean read depth ----
p8 <- ggplot(loc_stats_raw, aes(x = MEAN_DEPTH_BMA_raw)) +
  geom_histogram(binwidth = 20, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH_BMA_raw, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean read depth per locus") +
  theme_standard

# plot read depth vs missing data ----
p9 <- ggplot(loc_stats_raw, aes(x = MEAN_DEPTH_BMA_raw, y = MISS_BMA_raw)) +
  geom_point() +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH_BMA_raw, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = mean(MISS_BMA_raw, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = 0.1),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean depth per locus", y = "% missing data") +
  theme_standard

# plot depth vs SNP quality ----
site_qual <- read.table("data/VCF/BMA_raw.lqual", 
                        header = TRUE, stringsAsFactors = FALSE) %>%
  mutate(PROB = 10^(-QUAL/10))

temp <- data.frame(loc_stats_raw$MEAN_DEPTH_BMA_raw, site_qual$QUAL) %>%
  rename(depth = loc_stats_raw.MEAN_DEPTH_BMA_raw, qual = site_qual.QUAL)

p10 <- ggplot(temp, aes(x = depth, y = qual)) +
  geom_point(size = 1) +
  geom_vline(aes(xintercept = mean(depth, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = mean(qual, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean depth per locus", y = "SNP quality") +
  theme_standard

# plot number of SNPs per contig vs. mean depth ----
temp <- loc_stats_raw %>%
  count(CHR)

p11 <- left_join(temp, loc_stats_raw) %>%
  ggplot() +
  geom_point(aes(x = n, y = MEAN_DEPTH_BMA_raw)) +
  labs(x = "number of SNPs per contig", y = "mean depth") +
  theme_standard

# plot no of SNPs per locus ----
p12 <- loc_stats_raw %>%
  count(CHR) %>%
  ggplot(aes(x = n)) +
  geom_histogram(binwidth = 1, color = "black", fill = "darkorange") + 
  labs(x = "number of SNPs per locus") +
  theme_standard

mraw <- multiplot(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, cols=2)

```

## Choose threshold values for quality score, coverage, missing data, minor alleles and mapping/variant calling artifacts

### Filter 0: Remove LQ individuals

Remove (known) low quality individuals from data set:

Identify low quality individuals to remove from the data set; defined as individuals with a mean coverage of five or less reads across all loci, with more than 31% missing data, or Fis > (thresholds based on exploratory filtering).


```{r lq indv}

LQindv <- ind_stats_raw %>%
  filter(MEAN_DEPTH_BMA_raw <= 5 | MISS_BMA_raw >= 0.31 | Fis_BMA_raw > 0.2) %>%
  select(INDV)

write_delim(LQindv, "data/VCF/LQ_raw.ind", delim = "\t")

nrow(LQindv)

```

Remove low quality individuals and decompose indels.

```{bash filter lq indv, eval=FALSE, include=FALSE}

# decompose indels
vcfallelicprimitives data/VCF/temp/TotalRawSNPs.vcf --keep-info --keep-geno > data/VCF/temp/BMA.prim.vcf

# retain only SNPs / remove LQ individuals
vcftools --vcf data/VCF/temp/BMA.prim.vcf --out data/VCF/temp/BMA.F0 --remove-indels --remove data/VCF/LQ_raw.ind --recode --recode-INFO-all

# query stats
vcftools --vcf data/VCF/temp/BMA.F0.recode.vcf --out data/VCF/BMA.F0 --depth
vcftools --vcf data/VCF/temp/BMA.F0.recode.vcf --out data/VCF/BMA.F0 --site-mean-depth
vcftools --vcf data/VCF/temp/BMA.F0.recode.vcf --out data/VCF/BMA.F0 --missing-indv
vcftools --vcf data/VCF/temp/BMA.F0.recode.vcf --out data/VCF/BMA.F0 --missing-site
vcftools --vcf data/VCF/temp/BMA.F0.recode.vcf --out data/VCF/BMA.F0 --het
vcftools --vcf data/VCF/temp/BMA.F0.recode.vcf --out data/VCF/BMA.F0 --singletons
vcftools --vcf data/VCF/temp/BMA.F0.recode.vcf --out data/VCF/BMA.F0 --geno-depth

```

Compare stats:

```{r stats F0, fig.height=20, fig.width=10, message=TRUE, warning=TRUE}

# load stats files ----
ind_stats_F0 <- read.ind.stats(dir = "data/VCF", vcf = "BMA.F0") %>%
  separate(INDV, into = c("SP", "LIB", "SAMPLE_ID"), sep = "_", remove = FALSE, extra = "merge")

loc_stats_F0 <- read.loc.stats(dir = "data/VCF/", vcf = "BMA.F0")

```

Data set contains `r nrow(ind_stats_F0)` individuals and `r nrow(loc_stats_F0)` loci after removing low quality individuals and decomposing indels set into SNPs.

Import singletons and genotype depth file to create list of loci to exclude based on depth.

```{r read singleton genotype depth, message=FALSE, warning=FALSE}

# number of individuals
n <- nrow(ind_stats_F0)+2

# read singletons file
singletons <- read_table2("data/VCF/BMA.F0.singletons") %>%
  mutate(VARIANT = ifelse(ALLELE %in% c("A", "T", "C", "G"), "SNP", "INDEL"))

chrom <- unique(singletons$CHROM)

# read genotype depth file and join with singletons
gdepth <- read_table2("data/VCF/BMA.F0.gdepth") %>%
  filter(CHROM %in% chrom) %>%
  gather(key = INDV, value = DEPTH, 3:n)

singletons <- left_join(singletons, gdepth)
  
```

Data set contains `nrow(singletons)` singletons.

```{r doubletons}

# filter doubletons
doubletons <- singletons %>%
  filter(`SINGLETON/DOUBLETON` == "D")

```

Of those `nrow(doubletons)` are called as a homozygote in one individual.

```{r singletons per contig, fig.height=3, fig.width=4}

# number of contigs in data set
contigs_total <- loc_stats_F0 %>%
  distinct(CHR)

contigs_singletons <- singletons %>%
  distinct(CHROM)

contigs <- singletons %>%
  count(CHROM)

ggplot(contigs, aes(x = n)) +
  geom_histogram(binwidth = 1, color = "black", fill = "darkorange") + 
  geom_vline(aes(xintercept = mean(n, na.rm = TRUE)),
             color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "number of singletons per locus") +
  theme_standard

```

The data set contains `nrow(contigs_total)` contigs, `nrow(contigs_singletons)` (`2round( (nrow(contigs_singletons)/nrow(contigs_total)*100), digits = 2)`%) contain singletons.

```{r distribution singpletons per contig, fig.height=3, fig.width=6}

contigs <- singletons %>%
  group_by(`SINGLETON/DOUBLETON`) %>%
  count(CHROM)

ggplot(contigs, aes(x = n)) +
  geom_histogram(binwidth = 1, color = "black", fill = "darkorange") + 
  facet_grid(. ~ `SINGLETON/DOUBLETON`) +
  labs(x = "number of doubletons/singeltons per locus") +
  theme_standard

```

Target coverage is 20 reads per locus; flag singletons with depth < 10 reads and doubletons with depth < 20 reads.

```{r}

singletons %>% 
  group_by(`SINGLETON/DOUBLETON`) %>%
  count(DEPTH <= 10)

d <- singletons %>%
  filter(`SINGLETON/DOUBLETON` == "D" & DEPTH <= 20) 
  
LQ <- singletons %>%
  filter(DEPTH <= 10) %>%
  bind_rows(d) %>%
  select(CHROM, POS)

write_delim(LQ, "data/VCF/LQ_F0.loci", delim = "\t", col_names = FALSE)

```

Distriubtion of genotype depth per singleton/doubleton.

```{r distribution geno depth per singleton, fig.height=3, fig.width=6}

ggplot(singletons, aes(x = DEPTH)) +
  geom_histogram(binwidth = 20, color = "black", fill = "darkorange") +
  facet_grid(. ~ `SINGLETON/DOUBLETON`, scales = "free") +
  labs(x = "read depth") +
  scale_y_sqrt() +
  theme_standard

```

Quantiles distribution of read depth for SNP loci called in only one individual.

```{r quantile singleton depth}

quantile(singletons$DEPTH, probs = c(.05, .25, .5, .75, .95, .99))

```

Distribution of doubletons (homzygous genotype for that individual) and singletons (heterozygote genotype).

```{r distribution no singletons per indv, fig.height=3, fig.width=6, message=FALSE, warning=FALSE}

Ind <- singletons %>%
  group_by(`SINGLETON/DOUBLETON`) %>%
  count(INDV)

Ind <- left_join(Ind, ind_stats_F0)

ggplot(Ind, aes(x = n)) +
  geom_histogram(binwidth = 50, color = "black", fill = "darkorange") + 
  facet_grid(. ~ `SINGLETON/DOUBLETON`) +
  labs(x = "number of singletons per indv") +
  theme_standard

```

Quantiles distribution number of singletons called in one individual.

```{r quantile singletons per indv}

quantile(Ind$n, probs = c(.01, .05, .25, .5, .75, .99))

```

Compare the number of SNPs vs complex variants (INDELs).

```{r singleton variant type}

count(singletons, VARIANT)

```

### Filter 1: Confidence in SNP call
    
The QUAL column of a VCF file is a phred based score indicating the probability that the variant shown in the ALT column is wrong. Given the Phred quality score (Q), and the probability that a base is incorrectly called (P), Q = -10(Log10P). A quality score of 20 indicates, a 1 in 100 chance that the SNP site has been called incorrectly (i.e. 99% probability that correct call).

Filter loci with quality score < 20 and singletons/doubletons with low read depth (<10/20 reads). Code genotypes with genotype call quality < 20 or  genotype depth < 5 as missing.

```{bash filter LQ SNP calls}

# filter LQ SNP calls
vcftools --vcf data/VCF/temp/BMA.F0.recode.vcf --out data/VCF/temp/BMA.F1 --minQ 20 --minGQ 20 --minDP 5 --exclude-positions data/VCF/LQ_F0.loci --recode --recode-INFO-all

# query stats
vcftools --vcf data/VCF/temp/BMA.F1.recode.vcf --out data/VCF/BMA.F1 --depth
vcftools --vcf data/VCF/temp/BMA.F1.recode.vcf --out data/VCF/BMA.F1 --site-mean-depth
vcftools --vcf data/VCF/temp/BMA.F1.recode.vcf --out data/VCF/BMA.F1 --missing-indv
vcftools --vcf data/VCF/temp/BMA.F1.recode.vcf --out data/VCF/BMA.F1 --missing-site
vcftools --vcf data/VCF/temp/BMA.F1.recode.vcf --out data/VCF/BMA.F1 --het
vcftools --vcf data/VCF/temp/BMA.F1.recode.vcf --out data/VCF/BMA.F1 --geno-depth
vcftools --vcf data/VCF/temp/BMA.F1.recode.vcf --out data/VCF/BMA.F1 --singletons
vcftools --vcf data/VCF/temp/BMA.F1.recode.vcf --out data/VCF/BMA.F1 --indv-freq-burden
vcftools --vcf data/VCF/temp/BMA.F1.recode.vcf --out data/VCF/BMA.F1 --freq 

```

Compare stats post-filtering.

```{r stats F1, fig.height=12, fig.width=10, message=FALSE, warning=FALSE}

# load stats files ----
ind_stats_F1 <- read.ind.stats(dir = "data/VCF", vcf = "BMA.F1") %>%
  separate(INDV, into = c("SP", "LIB", "SAMPLE_ID"), sep = "_", remove = FALSE)

loc_stats_F1 <- read.loc.stats(dir = "data/VCF/", vcf = "BMA.F1")

```

Data set contains `r nrow(ind_stats_F1)` individuals and `r nrow(loc_stats_F1)` SNP sites.

Removing low confidence SNP loci and genotype calls results in a reduction in the number of the (maximum) SNPs per locus.

```{r N SNPs, fig.height=6, fig.width=5, message=FALSE, warning=FALSE}

p1 <- loc_stats_raw %>%
  count(CHR) %>%
  ggplot(aes(x = n)) +
  geom_histogram(binwidth = 1, color = "black", fill = "darkorange") + 
  geom_vline(aes(xintercept = mean(n, na.rm = TRUE)),
             color = "darkblue", linetype = "dashed", size = 1) +
  scale_x_continuous(limits = c(0, 100)) +
  labs(x = "number of SNPs per locus") +
  theme_standard

p2 <- loc_stats_F1 %>%
  count(CHR) %>%
  ggplot(aes(x = n)) +
  geom_histogram(binwidth = 1, color = "black", fill = "darkorange") + 
  geom_vline(aes(xintercept = mean(n, na.rm = TRUE)),
             color = "darkblue", linetype = "dashed", size = 1) +
  scale_x_continuous(limits = c(0, 100)) +
  labs(x = "number of SNPs per locus") +
  theme_standard

m1a <- multiplot(p1, p2, cols=1)

```

Coding genotypes with low read depth (< 5) as missing, results in an overall increase in missing data per locus and a shift in more individuals with more missing data - this is because individuals (and loci) with coverage issues are now characterized by higher missing data.

```{r missing data F1, fig.height=4, fig.width=8, message=FALSE, warning=FALSE}

iraw <- read.table("data/VCF/BMA_raw.imiss",
                    header = TRUE, stringsAsFactors = FALSE) %>%
  select(INDV, F_MISS) %>%
  rename(raw = F_MISS)

imiss <- read.table("data/VCF/BMA.F1.imiss",
                    header = TRUE, stringsAsFactors = FALSE) %>%
  select(INDV, F_MISS) %>%
  rename(F1 = F_MISS)

imiss <- left_join(imiss, iraw)

p1 <- ggplot(imiss, aes(x = raw, y = F1)) +
  geom_point(shape = 1) +
  geom_abline(slope = 1, color = "darkblue", linetype = "dashed", size = 1) +
  scale_x_continuous(limits = c(0, 1)) +
  scale_y_continuous(limits = c(0, 1)) +
  labs(x = "indv missing data raw", y = "indv missing data F1") +
  theme_standard

lraw <- read.table("data/VCF/BMA_raw.lmiss",
                    header = TRUE, stringsAsFactors = FALSE) %>%
  select(CHR, POS, F_MISS) %>%
  rename(raw = F_MISS)

lmiss <- read.table("data/VCF/BMA.F1.lmiss",
                    header = TRUE, stringsAsFactors = FALSE) %>%
  select(CHR, POS, F_MISS) %>%
  rename(F1 = F_MISS)

lmiss <- left_join(lmiss, lraw)

p2 <- ggplot(lmiss, aes(x = raw, y = F1)) +
  geom_point(shape = 1) +
  geom_abline(slope = 1, color = "darkblue", linetype = "dashed", size = 1) +
  scale_x_continuous(limits = c(0, 1)) +
  scale_y_continuous(limits = c(0, 1)) +
  labs(x = "loci missing data raw", y = "loci missing data F1") +
  theme_standard

m1b <- multiplot(p1, p2, cols=2)

```

Compare depth individuals after removing LQ SNP loci and genotypes.

```{r depth F1, fig.height=4, fig.width=5, message=FALSE, warning=FALSE}

iraw <- read.table("data/VCF/BMA_raw.idepth",
                    header = TRUE, stringsAsFactors = FALSE) %>%
  select(INDV, MEAN_DEPTH) %>%
  rename(raw = MEAN_DEPTH)

idepth <- read.table("data/VCF/BMA.F1.idepth",
                    header = TRUE, stringsAsFactors = FALSE) %>%
  select(INDV, MEAN_DEPTH) %>%
  rename(F1 = MEAN_DEPTH)

idepth <- left_join(iraw, idepth)

ggplot(idepth, aes(x = raw, y = F1)) +
  geom_point(shape = 1) +
  geom_abline(slope = 1, color = "darkblue", linetype = "dashed", size = 1) +
  scale_x_continuous(limits = c(0, 150)) +
  scale_y_continuous(limits = c(0, 150)) +
  labs(x = "mean depth ind raw", y = "mean depth ind F1") +
  theme_standard

```

### Filter 2: Genotype call rate and allowed missing data per indv

Remove loci with genotype call rate < 50% and minimum mean depth < 15 reads across all individuals.

```{bash 2a geno < 50}

vcftools --vcf data/VCF/temp/BMA.F1.recode.vcf --out data/VCF/temp/BMA.F2a --max-missing 0.5 --min-meanDP 15 --recode --recode-INFO-all

# library BMA-1
vcftools --vcf data/VCF/temp/BMA.F2a.recode.vcf --out data/VCF/temp/BMA1 --keep data/VCF/BMA1.ind  --recode --recode-INFO-all

vcftools --vcf data/VCF/temp/BMA1.recode.vcf --out data/VCF/BMA1 --missing-site

# library BMA-2
vcftools --vcf data/VCF/temp/BMA.F2a.recode.vcf --out data/VCF/temp/BMA2 --keep data/VCF/BMA2.ind  --recode --recode-INFO-all

vcftools --vcf data/VCF/temp/BMA2.recode.vcf --out data/VCF/BMA2 --missing-site

# library BMA-3
vcftools --vcf data/VCF/temp/BMA.F2a.recode.vcf --out data/VCF/temp/BMA3 --keep data/VCF/BMA3.ind  --recode --recode-INFO-all

vcftools --vcf data/VCF/temp/BMA3.recode.vcf --out data/VCF/BMA3 --missing-site

# Campeche
vcftools --vcf data/VCF/temp/BMA.F2a.recode.vcf --out data/VCF/temp/CAMP --keep data/VCF/CAMP.ind  --recode --recode-INFO-all

vcftools --vcf data/VCF/temp/CAMP.recode.vcf --out data/VCF/CAMP --missing-site

# Chandeleur Sound
vcftools --vcf data/VCF/temp/BMA.F2a.recode.vcf --out data/VCF/temp/CS --keep data/VCF/CS.ind  --recode --recode-INFO-all

vcftools --vcf data/VCF/temp/CS.recode.vcf --out data/VCF/CS --missing-site

# Corpus Christi Bay
vcftools --vcf data/VCF/temp/BMA.F2a.recode.vcf --out data/VCF/temp/CC --keep data/VCF/CC.ind  --recode --recode-INFO-all

vcftools --vcf data/VCF/temp/CC.recode.vcf --out data/VCF/CC --missing-site

# Florida Atlantic
vcftools --vcf data/VCF/temp/BMA.F2a.recode.vcf --out data/VCF/temp/FLA --keep data/VCF/FLA.ind  --recode --recode-INFO-all

vcftools --vcf data/VCF/temp/FLA.recode.vcf --out data/VCF/FLA --missing-site

# Florida Gulf South
vcftools --vcf data/VCF/temp/BMA.F2a.recode.vcf --out data/VCF/temp/FLGS --keep data/VCF/FLGS.ind  --recode --recode-INFO-all

vcftools --vcf data/VCF/temp/FLGS.recode.vcf --out data/VCF/FLGS --missing-site

# Florida Gulf North
vcftools --vcf data/VCF/temp/BMA.F2a.recode.vcf --out data/VCF/temp/FLGN --keep data/VCF/FLGN.ind  --recode --recode-INFO-all

vcftools --vcf data/VCF/temp/FLGN.recode.vcf --out data/VCF/FLGN --missing-site

# Louisiana
vcftools --vcf data/VCF/temp/BMA.F2a.recode.vcf --out data/VCF/temp/LA --keep data/VCF/LA.ind  --recode --recode-INFO-all

vcftools --vcf data/VCF/temp/LA.recode.vcf --out data/VCF/LA --missing-site

# Mississippi Sound
vcftools --vcf data/VCF/temp/BMA.F2a.recode.vcf --out data/VCF/temp/MISS --keep data/VCF/MISS.ind  --recode --recode-INFO-all

vcftools --vcf data/VCF/temp/MISS.recode.vcf --out data/VCF/MISS --missing-site

# Mobile Bay
vcftools --vcf data/VCF/temp/BMA.F2a.recode.vcf --out data/VCF/temp/MB --keep data/VCF/MB.ind  --recode --recode-INFO-all

vcftools --vcf data/VCF/temp/MB.recode.vcf --out data/VCF/MB --missing-site

```

Compare distribution of missing data per locus per library.

```{r lmiss per lib, fig.height=4, fig.width=3, message=FALSE, warning=FALSE}

# create empty list
loc_missing <- list()

for (i in 1:3) {
  
  lib <- paste("BMA", i, sep = "")
  
  loc_missing[[i]] <- read_delim(paste("data/VCF/", lib, ".lmiss", sep = ""), delim = "\t") %>%
    select(CHR, POS, F_MISS) %>%
    mutate(LIB = lib)
  
}

# create data frame with all information
loc_missing <- ldply(loc_missing, data.frame)

ggplot(loc_missing, aes(x = F_MISS)) +
  geom_histogram(binwidth = .1, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = 0.5),
             color = "darkred", linetype = "dashed", size = 1) +
  labs(x = "missing data per locus") +
  facet_grid(LIB ~ .) +
  theme_standard

```

Flag loci that were not called in > 50% of individuals in a given library.

```{r}

# identify loci with high missing data in each library
SNPs <- filter(loc_missing, F_MISS > 0.5) %>%
  arrange(CHR, POS)

count(SNPs, LIB)

LQloci_lib <- SNPs %>%
  select(CHR, POS) %>%
  unique()

```

A total of `r LQloci_lib` loci were called in less than 50% of individuals in one or more libraries. Loci being inconsistently called between libraries can result in library effects.

Compare distribution of missing data per locus per sample location.

```{r lmiss per loc, fig.height=10, fig.width=4, message=FALSE, warning=FALSE}

# create empty list
loc_missing <- list()

# pops to loop over
pop <- unique(Ind_RAW$POP)

# import missing data per locus
for (p in pop) {
  
  loc_missing[[p]] <- read_delim(paste("data/VCF/", p, ".lmiss", sep = ""), delim = "\t") %>%
    select(CHR, POS, F_MISS) %>%
    mutate(POP = p)
  
}

# create data frame with all information
loc_missing <- ldply(loc_missing, data.frame) %>%
  select(-`.id`)

ggplot(loc_missing, aes(x = F_MISS)) +
  geom_histogram(binwidth = .05, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = 0.25),
             color = "darkred", linetype = "dashed", size = 1) +
  labs(x = "missing data per locus") +
  facet_grid(POP ~ .) +
  scale_y_sqrt() +
  theme_standard

```

Flag loci that were not called in > 75% of individuals at a given sample location.

```{r}

# identify loci with high missing data in each library
SNPs <- filter(loc_missing, F_MISS > 0.25) %>%
  arrange(CHR, POS)

count(SNPs, POP)

LQloci_pop <- SNPs %>%
  select(CHR, POS) %>%
  unique()

LQloci <- bind_rows(LQloci_lib, LQloci_pop) %>%
  unique()

# Write contig/position to file
write.table(LQloci, file = "data/VCF/LQ_F2b.loci", 
            col.names= FALSE, row.names = FALSE, quote = FALSE)

```

Remove loci that were not consistently called across all libraries and sample locations.

```{bash filter missing by lib}

vcftools --vcf data/VCF/temp/BMA.F2a.recode.vcf --out data/VCF/temp/BMA.F2b --exclude-positions data/VCF/LQ_F2b.loci --recode --recode-INFO-all

vcftools --vcf data/VCF/temp/BMA.F2b.recode.vcf --out data/VCF/BMA.F2b --missing-indv

```

Identify individuals with > 25% missing data

```{r imiss > 50, fig.height=3, fig.width=4}

# determine cutoff
imiss <- read.table("data/VCF/BMA.F2b.imiss", 
           header = TRUE, stringsAsFactors = FALSE) 

ggplot(imiss, aes(x = F_MISS)) +
  geom_histogram(binwidth = 0.01, color = "black", fill = "darkorange") +
  geom_vline(xintercept = 0.25, color = "darkblue", linetype = "dashed", size = 1) +
  theme_standard

imiss <- imiss %>%
  filter(F_MISS > 0.25) %>%
  select(INDV)

write.table(imiss, "data/VCF/F2b_LQ.indv",
            col.names = TRUE, row.names = FALSE, quote = FALSE)

```

Remove flagged individuals.

```{bash filter LQ indv}

vcftools --vcf data/VCF/temp/BMA.F2b.recode.vcf --out data/VCF/temp/BMA.F2 --remove data/VCF/F2b_LQ.indv --recode --recode-INFO-all

vcftools --vcf data/VCF/temp/BMA.F2.recode.vcf --out data/VCF/BMA.F2 --depth
vcftools --vcf data/VCF/temp/BMA.F2.recode.vcf --out data/VCF/BMA.F2 --site-mean-depth
vcftools --vcf data/VCF/temp/BMA.F2.recode.vcf --out data/VCF/BMA.F2 --missing-indv
vcftools --vcf data/VCF/temp/BMA.F2.recode.vcf --out data/VCF/BMA.F2 --missing-site
vcftools --vcf data/VCF/temp/BMA.F2.recode.vcf --out data/VCF/BMA.F2 --het
vcftools --vcf data/VCF/temp/BMA.F2.recode.vcf --out data/VCF/BMA.F2 --geno-depth

```

Analyze stats post-filtering:

```{r stats F2, fig.height=20, fig.width=10, message=FALSE, warning=FALSE}

# load stats files ----
ind_stats_F2 <- read.ind.stats(dir = "data/VCF", vcf = "BMA.F2") %>%
  separate(INDV, into = c("SP", "LIB", "SAMPLE_ID"), sep = "_", remove = FALSE)

loc_stats_F2 <- read.loc.stats(dir = "data/VCF/", vcf = "BMA.F2")

```


### Filter 3: Filter loci and individuals based on depth, variance in depth and genotype call rate

Determine mean depth and variance per locus per library.

```{bash depth data per lib}

# library BMA-1
vcftools --vcf data/VCF/temp/BMA.F2.recode.vcf --out data/VCF/temp/BMA1 --keep data/VCF/BMA1.ind  --recode --recode-INFO-all

vcftools --vcf data/VCF/temp/BMA1.recode.vcf --out data/VCF/BMA1 --site-mean-depth

# library BMA-2
vcftools --vcf data/VCF/temp/BMA.F2.recode.vcf --out data/VCF/temp/BMA2 --keep data/VCF/BMA2.ind  --recode --recode-INFO-all

vcftools --vcf data/VCF/temp/BMA2.recode.vcf --out data/VCF/BMA2 --site-mean-depth

# library BMA-3
vcftools --vcf data/VCF/temp/BMA.F2.recode.vcf --out data/VCF/temp/BMA3 --keep data/VCF/BMA3.ind  --recode --recode-INFO-all

vcftools --vcf data/VCF/temp/BMA3.recode.vcf --out data/VCF/BMA3 --site-mean-depth

```

Compare distribution of depth coverage per locus per library. Identify loci that do not have consistent coverage between libraries (can lead to library effects), and/or across individuals. 

```{r depth per locus per lib, fig.height=8, fig.width=4, message=FALSE, warning=FALSE}

# create empty list
loc_depth <- list()

# import depth data per locus
loc_depth[[1]] <- read.table("data/VCF/BMA1.ldepth.mean", 
                        header = TRUE, stringsAsFactors = FALSE) %>%
  mutate(LIB = "BMA1")

loc_depth[[2]] <- read.table("data/VCF/BMA2.ldepth.mean", 
                        header = TRUE, stringsAsFactors = FALSE) %>%
  mutate(LIB = "BMA2")

loc_depth[[3]] <- read.table("data/VCF/BMA3.ldepth.mean", 
                        header = TRUE, stringsAsFactors = FALSE) %>%
  mutate(LIB = "BMA3")

# create data frame with all information
loc_depth <- ldply(loc_depth, data.frame)

ggplot(loc_depth, aes(x = MEAN_DEPTH)) +
  geom_histogram(binwidth = 20, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH, na.rm = TRUE)),
             color = "darkred", linetype = "dashed", size = 1) +
  geom_vline(xintercept = 20,
             color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean depth per locus") +
  facet_grid(LIB ~ .) +
  theme_standard

```

Identify loci with large variance in mean depth across libraries and/or individuals by calculating the coefficient of variance (STD/MEAN).

```{r calculate variation in depth, message=FALSE, warning=FALSE}

# calculate mean depth weighted by library
depth_comp <- loc_depth %>%
  group_by(LIB) %>%
  distinct(CHROM, .keep_all = TRUE) %>%
  select(-POS) %>%
  ungroup() %>%
  group_by(CHROM) %>%
  summarise(MEAN = mean(MEAN_DEPTH),
            STD = sd(MEAN_DEPTH))

# mean depth across all individuals
temp <- read.table("data/VCF/BMA.F2.ldepth.mean", 
                   header = TRUE, stringsAsFactors = FALSE) %>%
  distinct(CHROM, .keep_all = TRUE) %>%
  select(-POS) %>%
  mutate(STD_DEPTH = sqrt(VAR_DEPTH))

# calculate coefficient of variation
depth_comp <- left_join(depth_comp, temp) %>%
  mutate(COEFF_VAR_LIB = STD/MEAN*100,
         COEFF_VAR_IND = STD_DEPTH/MEAN_DEPTH*100)

```

Compare mean across all individual to mean weighted by library and coefficient of variance of read depth across individuals and between libraries:

```{r mean std depth, fig.height=7, fig.width=8.5}

p1 <- ggplot(depth_comp, aes(x = MEAN_DEPTH, y = MEAN)) +
  geom_point(shape = 1) +
  labs(x = "mean depth per locus", y = "mean weighted by lib") +
  geom_abline(slope = 1, linetype = "dashed", color = "darkred", size = 1) +
  theme_standard

p2 <- ggplot(depth_comp, aes(x = COEFF_VAR_IND)) +
  geom_histogram(binwidth = 10, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = quantile(COEFF_VAR_IND, .99)),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "coeff var depth across all indv") +
  theme_standard

p3 <- ggplot(depth_comp, aes(x = COEFF_VAR_IND, y = COEFF_VAR_LIB)) +
  geom_point(shape = 1) +
  scale_x_continuous(limits = c(0, 200)) +
  scale_y_continuous(limits = c(0, 100)) +
  labs(x = "coeff var depth across all indv", y = "coeff var mean depth betw lib") +
  theme_standard

p4 <- ggplot(depth_comp, aes(x = COEFF_VAR_LIB)) +
  geom_histogram(binwidth = 10, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = quantile(COEFF_VAR_LIB, .99)),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "coeff var depth between lib") +
  theme_standard

m3a <- multiplot(p1, p2, p3, p4, cols=2)

```

If loci have consistent coverage across loci the mean read depth per locus across all individuals and weighted by library should fall on the red diagonal. 

Flag loci that have high variation in mean coverage between individuals and between libraries.

```{r LQ loci variation coverage}

# identify loci high variation in coverage
contigs_var <- depth_comp %>%
  filter(COEFF_VAR_LIB > 90 | COEFF_VAR_IND > 150)

SNPs_var <- filter(loc_depth, CHROM %in% contigs_var$CHROM) %>%
  distinct(CHROM, POS)

# Write contig/position to text file, use file with vcftools to remove positions from dataset 
write.table(SNPs_var, file = "data/VCF/LQ_F3.loci", 
            col.names= FALSE, row.names = FALSE, quote = FALSE)

```

Remove loci flagged for high variance in depth across all individuals and between libraries (removes library effects), filter loci with mean depth < 20 and genotype call rate < 75%

```{bash filter variance in coverage loci}

vcftools --vcf data/VCF/temp/BMA.F2.recode.vcf --out data/VCF/temp/BMA.F3a --exclude-positions data/VCF/LQ_F3.loci --min-meanDP 15 --max-missing 0.75 --recode --recode-INFO-all

vcftools --vcf data/VCF/temp/BMA.F3a.recode.vcf --out data/VCF/BMA.F3a --depth
vcftools --vcf data/VCF/temp/BMA.F3a.recode.vcf --out data/VCF/BMA.F3a --geno-depth

```

Compare mean depth and number of sites called per individual.

```{r idepth sites vs depth I, fig.height=3, fig.width=4.5, message=FALSE, warning=FALSE}

read_table2("data/VCF/BMA.F3a.idepth") %>%
  ggplot(aes(x = N_SITES, y = MEAN_DEPTH)) +
  geom_point(shape = 1, size = 1.5) +
  geom_hline(yintercept = 10, linetype = "dashed", color = "darkred", size = 0.75) +
  geom_hline(yintercept = 20, linetype = "dashed", color = "darkblue", size = 0.75) +
  labs(x = "number of sites", y = "mean read depth") +
  theme_standard

```

Use genotype depth file to identify individuals with high variance in read depth across loci.

```{r variance depth individual I}

# number of individuals
n <- nrow(ind_stats_F2)+1

# read genotype depth & code values < 5 as missing
gdepth <- read_table2("data/VCF/BMA.F3a.gdepth") %>%
  select(-POS) %>%
  distinct(CHROM, .keep_all = TRUE) %>%
  gather(key = INDV, value = DEPTH, 3:n) %>%
  mutate(DEPTH = as.numeric(gsub(-1, 0, DEPTH)),
         DEPTH = as.numeric(gsub("\\<1\\>", 0, DEPTH)),
         DEPTH = as.numeric(gsub("\\<2\\>", 0, DEPTH)),
         DEPTH = as.numeric(gsub("\\<3\\>", 0, DEPTH)),
         DEPTH = as.numeric(gsub("\\<4\\>", 0, DEPTH)))

# calculate summary statistics
idepth <- gdepth %>%
  group_by(INDV) %>%
  summarize(TOTAL = sum(DEPTH),
            MAX = max(DEPTH),
            MEAN_NON0 = mean(DEPTH[DEPTH > 0]),
            MEAN = mean(DEPTH),
            MEDIAN = median(DEPTH),
            VAR = var(DEPTH),
            STD = sd(DEPTH)) %>%
  mutate(COEFF_VAR = STD/MEAN,
         RATIO_MEAN_MEDIAN = MEAN/MEDIAN)

idepth <- left_join(idepth, ind_stats_F2)

```

Compare variability of depth within individuals.

```{r, fig.height=15, fig.width=10}

p1 <- ggplot(idepth, aes(x = MAX, y = TOTAL)) +
  geom_smooth(method = "auto",linetype = "dashed", size = 1, color = "darkred") +
  geom_point(shape = 1) +
  theme_standard

p2 <- ggplot(idepth, aes(x = TOTAL, y = VAR)) +
  geom_smooth(method = "auto",linetype = "dashed", size = 1, color = "darkred") +
  geom_point(shape = 1) +
  theme_standard

p3 <- ggplot(idepth, aes(MAX, MEDIAN)) +
  geom_smooth(method = "auto",linetype = "dashed", size = 1, color = "darkred") +
  geom_point(shape = 1) +
  theme_standard

p4 <- ggplot(idepth, aes(x = MEAN, y = MEDIAN)) +
  geom_smooth(method = "auto",linetype = "dashed", size = 1, color = "darkred") +
  geom_point(shape = 1) +
  theme_standard

p5 <- ggplot(idepth, aes(x = COEFF_VAR, y = RATIO_MEAN_MEDIAN)) +
  geom_smooth(method = "auto",linetype = "dashed", size = 1, color = "darkred") +
  geom_point(shape = 1) +
  theme_standard

p6 <- ggplot(idepth, aes(MEAN)) +
  geom_histogram(binwidth = 5, color = "black", fill = "darkorange") +
  geom_vline(xintercept = 15, color = "darkred", linetype = "dashed") +
  theme_standard

p7 <- ggplot(idepth, aes(MEDIAN)) +
  geom_histogram(binwidth = 5, color = "black", fill = "darkorange") +
  geom_vline(xintercept = 10, color = "darkred", linetype = "dashed") +
  theme_standard

p8 <- ggplot(idepth, aes(RATIO_MEAN_MEDIAN)) +
  geom_histogram(binwidth = 0.1, color = "black", fill = "darkorange") +
  theme_standard

p9 <- ggplot(idepth, aes(COEFF_VAR)) +
  geom_histogram(binwidth = 0.025, color = "black", fill = "darkorange") +
  theme_standard

p10 <- ggplot(idepth, aes(x = COEFF_VAR, y = MISS_BMA.F2)) +
  geom_point(shape = 1) +
  geom_hline(yintercept = 0.25, linetype = "dashed", color = "darkred", size = 0.75) +
  theme_standard


m3b <- multiplot(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, cols=2)

```

Flag LQ individuals.

```{r}

LQ_depth <- idepth %>%
 filter(MEAN <= 10 | MEDIAN <= 5) %>%
 select(INDV)

LQ_depth
 
write.table(LQ_depth, "data/VCF/F3_LQ.indv",
           col.names = TRUE, row.names = FALSE, quote = FALSE)

```

Remove flagged loci and individuals.

```{bash filter variance in coverage}

vcftools --vcf data/VCF/temp/BMA.F3a.recode.vcf --out data/VCF/temp/BMA.F3 --remove data/VCF/F3_LQ.indv --recode --recode-INFO-all

vcftools --vcf data/VCF/temp/BMA.F3.recode.vcf --out data/VCF/BMA.F3 --depth
vcftools --vcf data/VCF/temp/BMA.F3.recode.vcf --out data/VCF/BMA.F3 --site-mean-depth
vcftools --vcf data/VCF/temp/BMA.F3.recode.vcf --out data/VCF/BMA.F3 --missing-indv
vcftools --vcf data/VCF/temp/BMA.F3.recode.vcf --out data/VCF/BMA.F3 --missing-site
vcftools --vcf data/VCF/temp/BMA.F3.recode.vcf --out data/VCF/BMA.F3 --het

```

Compare stats post-filtering:

```{r stats F3, fig.height=10, fig.width=8, message=FALSE, warning=FALSE}

# load stats files ----
ind_stats_F3 <- read.ind.stats(dir = "data/VCF", vcf = "BMA.F3") %>%
  separate(INDV, into = c("SP", "LIB", "SAMPLE_ID"), sep = "_", remove = FALSE)

loc_stats_F3 <- read.loc.stats(dir = "data/VCF/", vcf = "BMA.F3")

# plot missing data per indv ----
p1 <- ggplot(ind_stats_F3, aes(x = MISS_BMA.F3)) +
  geom_histogram(binwidth = .01, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MISS_BMA.F3, na.rm = TRUE)),
             color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 0.25),
             color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "missing data per indv") +
  theme_standard

# plot read depth per indv ----
p2 <- ggplot(ind_stats_F3, aes(x = MEAN_DEPTH_BMA.F3)) +
  geom_histogram(binwidth = 10, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH_BMA.F3, na.rm = TRUE)),
             color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
             color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean read depth per indv") +
  theme_standard

# plot depth vs missing ----
p3 <- ggplot(ind_stats_F3, aes(x = MEAN_DEPTH_BMA.F3, y = MISS_BMA.F3)) +
  geom_point() +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH_BMA.F3, na.rm = TRUE)),
             color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
             color = "darkblue", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = mean(MISS_BMA.F3, na.rm = TRUE)),
             color = "red", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = 0.25),
             color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean depth per indv", y = "% missing data") +
  theme_standard

# plot distribution missing data per locus ----
p4 <- ggplot(loc_stats_F3, aes(x = MISS_BMA.F3)) +
  geom_histogram(binwidth = 0.01, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MISS_BMA.F3, na.rm = TRUE)),
             color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 0.1),
             color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "% missing data per locus") +
  theme_standard

# plot distribution mean read depth ----
p5 <- ggplot(loc_stats_F3, aes(x = MEAN_DEPTH_BMA.F3)) +
  geom_histogram(binwidth = 20, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH_BMA.F3, na.rm = TRUE)),
             color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
             color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean read depth per locus") +
  theme_standard

# plot read depth vs missing data ----
p6 <- ggplot(loc_stats_F3, aes(x = MEAN_DEPTH_BMA.F3, y = MISS_BMA.F3)) +
  geom_point() +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH_BMA.F3, na.rm = TRUE)),
             color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
             color = "darkblue", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = mean(MISS_BMA.F3, na.rm = TRUE)),
             color = "red", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = 0.1),
             color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean depth per locus", y = "% missing data") +
  theme_standard

m3 <- multiplot(p1, p2, p3, p4, p5, p6, cols=2)

```

### Filter 4: Allele balance

AB: Allele balance at heterozygous sites: a number between 0 and 1 representing the ratio of reads showing the reference allele to all reads, considering only reads from individuals called as heterozygous

```{bash query INFO stats I, eval=FALSE, include=FALSE}

cut -f8 BMA.F3.recode.vcf | grep -oe "AB=[[:digit:]].[[:digit:]][[:digit:]][[:digit:]]" | sed -s 's/AB=//g' > BMA.F4.AB

```

Allele balance is the ratio of reads for reference allele to all reads, considering only reads from individuals called as heterozygous. Values range from 0 - 1; allele balance (for real loci) should be approx. 0.5. Filter SNPs for which the with allele balance < 0.25 and > 0.75.

```{r plot AB, fig.height=4, fig.width=5}

read.table("data/VCF/temp/BMA.F4.AB",
           col.names = "AB", stringsAsFactors = FALSE) %>%
  ggplot(aes(x = AB)) +
  geom_histogram(binwidth = 0.01, color = "black", fill = "darkorange") +
  geom_vline(xintercept = 0.5, color = "red", linetype = "dashed", size = 1) +
  geom_vline(xintercept = 0.25, color = "darkblue", linetype = "dashed", size = 1) +
  geom_vline(xintercept = 0.75, color = "darkblue", linetype = "dashed", size = 1) +
  theme_standard

```

Filter contigs with SNP calls with AB > 0.25, AB > 0.75; retain loci very close to 0 (retain loci that are fixed variants). Remove genotypes if the quality sum of the reference or alternate allele was 0.

```{bash, eval=FALSE, include=FALSE}

# this works... not sure why cant write to file in the folder
vcffilter -s -f "AB > 0.25 & AB < 0.75 | AB < 0.01 | AB > 0.99" -s -g "QR > 0 | QA > 0" BMA.F3.recode.vcf > BMA.F4.vcf 
mawk '!/#/' BMA.F4.vcf | wc -l

```

Remaining SNPs: 52,610.


### Filter 5: quality/depth ratio

```{bash}

# site depth
cut -f8 BMA.F4.vcf | grep -oe "DP=[0-9]*" | sed -s 's/DP=//g' > BMA.4.DEPTH

# quality score
mawk '!/#/'  BMA.F4.vcf | cut -f1,2,6 > BMA.F4.loci.qual

```

Compare quality/depth ratio.

```{r fig.height=3, fig.width=4, message=FALSE, warning=FALSE}

# depth
depth <- read.table("data/VCF/temp/BMA.4.DEPTH",
                    col.names = "depth")

# quality score
qual <- read.table("data/VCF/temp/BMA.F4.loci.qual",
                   col.names = c("locus", "pos", "qual"))

temp <- bind_cols(qual, depth) %>%
  mutate(ratio = qual/depth)

ggplot(temp, aes(x = ratio)) +
  geom_histogram(binwidth = 0.25, color = "black", fill = "grey85") +
  geom_vline(xintercept = 0.2, color = "darkred", linetype = "dashed", size = 1) +
  theme_standard

```

Remove loci with quality/depth ratio < 0.2

```{bash}

vcffilter -s -f "QUAL / DP > 0.2" BMA.F4.vcf > BMA.F5.vcf 

mawk '!/#/' BMA.F5.vcf | wc -l

```

Remaining SNPs: 31,880.

### Filter 6: mapping quality

```{bash query mapping quality, eval=FALSE, include=FALSE}

cut -f8 BMA.F5.vcf | grep -oe "MQM=[0-9]*" | sed -s 's/MQM=//g' > BMA.F5.MQM

cut -f8 BMA.F5.vcf | grep -oe "MQMR=[0-9]*" | sed -s 's/MQMR=//g' > BMA.F5.MQMR

```

Remove loci based on ratio of mapping quality for reference and alternate allele, i.e. sites that have a high discrepancy between the mapping qualities of two alleles.

```{r plot map qual ratios, fig.height=4, fig.width=5}

temp <- read.table("data/VCF/temp/BMA.F5.MQM", col.names = "MQM")

mapqual <- read.table("data/VCF/temp/BMA.F5.MQMR", col.names = "MQMR")

mapqual <- bind_cols(mapqual, temp) %>%
  mutate(ratio = MQM/MQMR)

filter <- mapqual %>%
  filter(ratio < 0.25 | ratio > 1.75)

ggplot(mapqual, aes(x = MQM, y = MQMR)) +
  geom_point(shape = 1, size = 1) + 
  geom_abline(intercept = 0, slope = 1, size = 1, color = "red", linetype = "dashed") +
  geom_abline(intercept = 0, slope = 4, size = 1, color = "darkblue", linetype = "dashed") +
  geom_abline(intercept = 0, slope = 0.571, size = 1, color = "darkblue", linetype = "dashed") +
  geom_point(data = filter, aes(x = MQM, y = MQMR), shape = 21, color = "black", fill = "red") +
  scale_x_continuous(limits = c(0, 65)) +
  scale_y_continuous(limits = c(0, 65)) +
  theme_standard

```

Filter loci with mapping quality ratio < 0.25 and > 1.75.

```{bash filter map qual ratio}

vcffilter -s -f "MQM / MQMR > 0.25 & MQM / MQMR < 1.75" BMA.F5.vcf > BMA.F6.vcf

mawk '!/#/' BMA.F6.vcf | wc -l

```

Remaining SNPs: 30,183.

### Filter 7: Strand balance

SRF: Number of reference observations on the forward strand
SAF: Number of alternate observations on the forward strand
SRR: Number of reference observations on the reverse strand
SAR: Number of alternate observations on the reverse strand

```{bash query strandedness, eval=FALSE, include=FALSE}

cut -f8 BMA.F6.vcf | grep -oe "SAF=[0-9]*" | sed -s 's/SAF=//g' > BMA.F6.SAF

cut -f8 BMA.F6.vcf | grep -oe "SAR=[0-9]*" | sed -s 's/SAR=//g' > BMA.F6.SAR

cut -f8 BMA.F6.vcf | grep -oe "SRF=[0-9]*" | sed -s 's/SRF=//g' > BMA.F6.SRF

cut -f8 BMA.F6.vcf | grep -oe "SRR=[0-9]*" | sed -s 's/SRR=//g' > BMA.F6.SRR

```

Paired end reads should not overlap, and a SNP site should only be covered by either the forward or reverse read (strand). 

```{r plot strandedness, fig.height=4, fig.width=5}

SAF <- read.table("data/VCF/temp/BMA.F6.SAF",
                  col.names = "SAF")

SAR <- read.table("data/VCF/temp/BMA.F6.SAR",
                  col.names = "SAR")

strands <- bind_cols(SAF, SAR)

SRF <- read.table("data/VCF/temp/BMA.F6.SRF",
                  col.names = "SRF")

strands <- bind_cols(strands, SRF)

SRR <- read.table("data/VCF/temp/BMA.F6.SRR",
                  col.names = "SRR")

strands <- bind_cols(strands, SRR) %>%
  mutate(ratioA = SAF/SAR, ratioR = SRF/SRR)

ggplot(strands, aes(x = SAF, y = SAR)) +
  geom_point(shape = 1) +
  geom_abline(intercept = 0, slope = 0.1, color = "darkblue", linetype = "dashed", size = 1) +
  geom_abline(intercept = 0, slope = 100, color = "darkblue", linetype = "dashed", size = 1) +
  theme_standard

```

Remove SNP sites that have > 100x more forward alternate reads than reverse alternate reads and > 100x more forward reverse reads than reverse alternate reads.

```{bash filter strandedness}

vcffilter -f "SAF / SAR > 100 & SRF / SRR > 100 | SAR / SAF > 100 & SRR / SRF > 100" -s BMA.F6.vcf > BMA.F7.vcf

mawk '!/#/' BMA.F7.vcf | wc -l

```

Number of SNPs remaining: 26,722.

### Filter 8: Properly paired status

PAIRED: Proportion of observed alternate alleles which are supported by properly paired read fragments
PAIREDR: Proportion of observed reference alleles which are supported by properly paired read fragments

Identify loci with only unpaired reads mapping to them - an artifact introduced due de novo reference assembly. Compare number of paired reads mapping the reference and the alternate alleles to identify discrepancy in the paired status for reads supporting reference and alternate alleles.

```{bash filter paired status, eval=FALSE, include=FALSE}

vcffilter -f "PAIRED > 0.05 & PAIREDR > 0.05 & PAIREDR / PAIRED < 1.75 & PAIREDR / PAIRED > 0.25 | PAIRED < 0.05 & PAIREDR < 0.05" -s BMA.F7.vcf > BMA.F8.vcf

mawk '!/#/' BMA.F8.vcf | wc -l

```

Number of SNPs in data set: 23,778.

### Filter 9: Maximum depth & Quality

Identify distribution of depth (based on original data set) to identify loci with excess coverage.

Original number of individuals in data set is `r nrow(ind_stats_raw)` (INFO flags in filtered data set are are based on original number of individuals in data set).

Create file with the original site depth and quality score for each site:

```{bash, eval=FALSE, include=FALSE}

# site depth
cut -f8 BMA.F8.vcf | grep -oe "DP=[0-9]*" | sed -s 's/DP=//g' > BMA.F8.DEPTH

# quality score
mawk '!/#/'  BMA.F8.vcf | cut -f1,2,6 > BMA.F8.loci.qual

```

Calculate average depth and standard deviation:

```{r plot depth vs qual, fig.height=3, fig.width=5}

# depth
depth <- read.table("data/VCF/temp/BMA.F8.DEPTH",
                    col.names = "depth")

# quality score
qual <- read.table("data/VCF/temp/BMA.F8.loci.qual",
                   col.names = c("locus", "pos", "qual"))

# mean depth
mean_depth <- mean(depth$depth)

# standard deviation
std <- sd(depth$depth)

# calculate cutoff
cutoff <- sum(mean_depth + (2*std))

# identify SNPs with excess (i.e. depth > mean depth + 1 standard deviation
# and quality score < 2x the depth at that site
df <- bind_cols(qual, depth) %>%
  mutate(qualcutoff = 2*depth)

removeloc <- df %>%
  filter(depth > cutoff) %>%
  filter(qual < 2*depth)

# plot
ggplot(df, aes(x = depth, y = qual)) +
  geom_point(shape = 1) +
  geom_point(data = removeloc, aes(x = depth, y = qual), shape = 21, color = "black", fill = "red") +
  geom_line(data = df, aes(x = depth, y = qualcutoff), color = "blue",  linetype = "dashed", size = 1) +
  geom_vline(xintercept = cutoff, color = "blue", linetype = "dashed", size = 1) +
  theme_standard

LQ <- removeloc %>%
  select(locus, pos)

write.table(LQ, "data/VCF/temp/LQ_F9.loci", 
            col.names = FALSE, row.names = FALSE, quote = FALSE)

```

Mean depth per locus (across all indivuals) is `r mean_depth` and the standard deviation is `r std`. 

Filter SNP site with depth > mean depth + 1 standard deviation = `r sum(mean_depth + 2*std)` and that have quality scores < 2x the depth at that site and output depth per site.

```{bash filter depth qual, eval=FALSE, include=FALSE}

vcftools --vcf  data/VCF/temp/BMA.F8.vcf --exclude-positions data/VCF/temp/LQ_F9.loci --recode --recode-INFO-all --out data/VCF/temp/BMA.F9a

vcftools --vcf data/VCF/temp/BMA.F9a.recode.vcf --out data/VCF/BMA.F9a --site-mean-depth

```

Compare the distribution of mean depth per site averaged across individuals to determine cut-off value of sites with excessively high depth indicative of paralogs/multicopy loci.

```{r plot depth dist, fig.height=3, fig.width=5, message=FALSE, warning=FALSE}

# calculate mean depth per site (177 individuals)
read.table("data/VCF/BMA.F9a.ldepth.mean",
           header = TRUE, stringsAsFactors = FALSE) %>% 
  ggplot(aes(x = MEAN_DEPTH)) +
  geom_histogram(binwidth = 20, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = quantile(MEAN_DEPTH, .95)),
                 color = "darkblue", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = quantile(MEAN_DEPTH, .99)),
                 color = "darkblue", linetype = "dashed", size = 1) +
  scale_y_sqrt() +
  labs(x = "mean depth per site") +
  theme_standard

```

Choose cut-off for maximum mean read depth = 300.

```{bash filter max depth}

vcftools --vcf  data/VCF/temp/BMA.F8.vcf --out data/VCF/temp/BMA.F9 --max-meanDP 300 --exclude-positions data/VCF/temp/LQ_F9.loci --recode --recode-INFO-all 

vcftools --vcf data/VCF/temp/BMA.F9.recode.vcf --out data/VCF/BMA.F9 --depth
vcftools --vcf data/VCF/temp/BMA.F9.recode.vcf --out data/VCF/BMA.F9 --site-mean-depth
vcftools --vcf data/VCF/temp/BMA.F9.recode.vcf --out data/VCF/BMA.F9 --missing-indv
vcftools --vcf data/VCF/temp/BMA.F9.recode.vcf --out data/VCF/BMA.F9 --missing-site
vcftools --vcf data/VCF/temp/BMA.F9.recode.vcf --out data/VCF/BMA.F9 --het

```

Depth distribution per locus after filtering:

```{r fig.height=3, fig.width=4, message=FALSE, warning=FALSE}

read_table2("data/VCF/BMA.F9.ldepth.mean") %>%
  ggplot(aes(x = MEAN_DEPTH)) +
  geom_histogram(binwidth = 10, color = "black", fill = "darkorange") +
  theme_standard

```

Analyze stats post-filtering:

```{r stats F9, fig.height=10, fig.width=10, message=FALSE, warning=FALSE}

# load stats files ----
ind_stats_F9 <- read.ind.stats(dir = "data/VCF", vcf = "BMA.F9") %>%
  separate(INDV, into = c("SP", "LIB", "SAMPLE_ID"), sep = "_", remove = FALSE)

loc_stats_F9 <- read.loc.stats(dir = "data/VCF/", vcf = "BMA.F9")

# plot missing data per indv ----
p1 <- ggplot(ind_stats_F9, aes(x = MISS_BMA.F9)) +
  geom_histogram(binwidth = .01, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MISS_BMA.F9, na.rm = TRUE)),
             color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 0.25),
             color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "missing data per indv") +
  theme_standard

# plot read depth per indv ----
p2 <- ggplot(ind_stats_F9, aes(x = MEAN_DEPTH_BMA.F9)) +
  geom_histogram(binwidth = 10, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH_BMA.F9, na.rm = TRUE)),
             color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
             color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean read depth per indv") +
  theme_standard

# plot depth vs missing ----
p3 <- ggplot(ind_stats_F9, aes(x = MEAN_DEPTH_BMA.F9, y = MISS_BMA.F9)) +
  geom_point() +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH_BMA.F9, na.rm = TRUE)),
             color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
             color = "darkblue", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = mean(MISS_BMA.F9, na.rm = TRUE)),
             color = "red", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = 0.25),
             color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean depth per indv", y = "% missing data") +
  theme_standard

# plot distribution missing data per locus ----
p4 <- ggplot(loc_stats_F9, aes(x = MISS_BMA.F9)) +
  geom_histogram(binwidth = 0.01, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MISS_BMA.F9, na.rm = TRUE)),
             color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 0.1),
             color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "% missing data per locus") +
  theme_standard

# plot distribution mean read depth ----
p5 <- ggplot(loc_stats_F9, aes(x = MEAN_DEPTH_BMA.F9)) +
  geom_histogram(binwidth = 5, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH_BMA.F9, na.rm = TRUE)),
             color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
             color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean read depth per locus") +
  theme_standard

# plot read depth vs missing data ----
p6 <- ggplot(loc_stats_F9, aes(x = MEAN_DEPTH_BMA.F9, y = MISS_BMA.F9)) +
  geom_point() +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH_BMA.F9, na.rm = TRUE)),
             color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
             color = "darkblue", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = mean(MISS_BMA.F9, na.rm = TRUE)),
             color = "red", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = 0.1),
             color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean depth per locus", y = "% missing data") +
  theme_standard

m3 <- multiplot(p1, p2, p3, p4, p5, p6, cols=2)

```

Data set contains `r nrow(loc_stats_F9)` SNP sites and `r nrow(ind_stats_F9)` individuals.


### Filter 10: Missing data and minimum depth per locus

Filter loci with mean read depth < 15 and genotype call rate < 90%.

```{bash geno 90 meanDP 15}

vcftools --vcf data/VCF/temp/BMA.F9.recode.vcf --out data/VCF/temp/BMA.F10 --min-meanDP 15 --max-missing 0.9 --recode --recode-INFO-all

vcftools --vcf data/VCF/temp/BMA.F10.recode.vcf --out data/VCF/BMA.F10 --depth
vcftools --vcf data/VCF/temp/BMA.F10.recode.vcf --out data/VCF/BMA.F10 --site-mean-depth
vcftools --vcf data/VCF/temp/BMA.F10.recode.vcf --out data/VCF/BMA.F10 --missing-indv
vcftools --vcf data/VCF/temp/BMA.F10.recode.vcf --out data/VCF/BMA.F10 --missing-site
vcftools --vcf data/VCF/temp/BMA.F10.recode.vcf --out data/VCF/BMA.F10 --het
vcftools --vcf data/VCF/temp/BMA.F10.recode.vcf --out data/VCF/BMA.F10 --hardy

```

### Filter 11: Excess heterozygosity

Identify SNPs with more than 0.5 heterozygosity and significant.

```{r}

# calculate observed heterozygosity
hwe <- read.table("data/VCF/BMA.F10.hwe", 
                  stringsAsFactors = FALSE, header = TRUE) %>%
  select(-ChiSq_HWE) %>%
  separate(`OBS.HOM1.HET.HOM2.`, 
           into = c("obs_hom1", "obs_het", "obs_hom2"), 
           sep = "/", convert = TRUE) %>%
  separate(`E.HOM1.HET.HOM2.`, 
           into = c("exp_hom1", "exp_het", "exp_hom2"), 
           sep = "/", convert = TRUE) %>%   
  mutate(Ho = obs_het/(obs_hom1 + obs_hom2 + obs_het),
         He = exp_het/(exp_hom1 + exp_hom2 + exp_het)) %>%
  select(CHR, POS, obs_hom1, exp_hom1, obs_hom2, exp_hom2, P_HET_DEFICIT, obs_het, exp_het, P_HET_EXCESS, Ho, He, P_HWE)

```

Distribution of observed heterozygosity.

```{r fig.height=3, fig.width=4}

ggplot(hwe, aes(x = Ho)) + 
  geom_histogram(binwidth = 0.05, color = "black", fill = "darkorange") +
  geom_vline(xintercept = 0.5, color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "observed heterozygosity") +
  theme_standard

```

Identify loci with heterozygosity > 0.5, then correct p-values for multiple comparisons using Benjamini-Hochberg method.

```{r}

# identify contigs with SNPs with Ho > 0.5 & significant
excess_hwe <- hwe %>%
  filter(Ho > 0.5) %>%
  mutate(p_adj = p.adjust(P_HET_EXCESS), method = "fdr") %>%
  filter(p_adj < 0.01)

contigs <- hwe %>%
  filter(CHR %in% excess_hwe$CHR) %>%
  select(CHR) %>%
  unique()

# identify all SNPs on contigs
SNPs <- hwe %>%
  filter(CHR %in% contigs$CHR) %>%
  select(CHR, POS)

# View(contigs)

# Write contig/position to text file, use file with vcftools to remove positions from dataset 
write.table(SNPs, file = "data/VCF/hetexcess.loci", 
            col.names= FALSE, row.names = FALSE, quote = FALSE)

```

Remove SNPs with excess heterozygosity and contigs with more than one SNP w/excess heterozygosity.

```{bash}

vcftools --vcf data/VCF/temp/BMA.F10.recode.vcf --out data/VCF/temp/BMA.F11 --exclude-positions data/VCF/hetexcess.loci --recode --recode-INFO-all

vcftools --vcf data/VCF/temp/BMA.F11.recode.vcf --out data/VCF/BMA.F11 --depth
vcftools --vcf data/VCF/temp/BMA.F11.recode.vcf --out data/VCF/BMA.F11 --site-mean-depth
vcftools --vcf data/VCF/temp/BMA.F11.recode.vcf --out data/VCF/BMA.F11 --missing-indv
vcftools --vcf data/VCF/temp/BMA.F11.recode.vcf --out data/VCF/BMA.F11 --missing-site
vcftools --vcf data/VCF/temp/BMA.F11.recode.vcf --out data/VCF/BMA.F11 --het
vcftools --vcf data/VCF/temp/BMA.F11.recode.vcf --out data/VCF/BMA.F11 --geno-depth

```

Visualize stats:

```{r stats F12, fig.height=20, fig.width=10, message=FALSE, warning=FALSE}

# load stats files ----
ind_stats_F11 <- read.ind.stats(dir = "data/VCF", vcf = "BMA.F11") %>%
  separate(INDV, into = c("SP", "LIB", "SAMPLE_ID"), sep = "_", remove = FALSE)

loc_stats_F11 <- read.loc.stats(dir = "data/VCF/", vcf = "BMA.F11")

```

Data set contains `r nrow(loc_stats_F11)` SNP sites and `r nrow(ind_stats_F11)` individuals.

### Filter 12: Filter LQ individuals

Compare mean depth and number of sites called per individual.

```{r idepth sites vs depth, fig.height=3, fig.width=5, message=FALSE, warning=FALSE}

read_table2("data/VCF/BMA.F11.idepth") %>%
  ggplot(aes(x = N_SITES, y = MEAN_DEPTH)) +
  geom_point(shape = 1, size = 1) +
  labs(x = "number of sites", y = "mean read depth") +
  theme_standard

```

Use genotype depth file to identify individuals with high variance in read depth across loci.

```{r variance depth individual}

# number of individuals
n <- nrow(ind_stats_F11)+1

# read genotype depth & code values < 5 as missing, retain only one locus per contig
gdepth <- read_table2("data/VCF/BMA.F11.gdepth") %>%
  select(-POS) %>%
  distinct(CHROM, .keep_all = TRUE) %>%
  gather(key = INDV, value = GDEPTH, 2:n) %>%
  mutate(GDEPTH = as.numeric(gsub(-1, 0, GDEPTH))) %>%
  mutate(DEPTH = GDEPTH) %>%
  mutate(DEPTH = as.numeric(gsub("\\<1\\>", 0, DEPTH))) %>%
  mutate(DEPTH = as.numeric(gsub("\\<2\\>", 0, DEPTH))) %>%
  mutate(DEPTH = as.numeric(gsub("\\<3\\>", 0, DEPTH))) %>%
  mutate(DEPTH = as.numeric(gsub("\\<4\\>", 0, DEPTH))) %>%
  mutate(MISSING = ifelse(DEPTH == 0, "missing", "genotyped")) %>%
  mutate(GDEPTHBIN = cut(GDEPTH, 
                         breaks = c(0, 5, 10, 25, 50, 100, 500, max(GDEPTH)),
                         labels = c("<5", "3-10", "10-25", "25-50", "50-100", "100-500", ">500"))) %>%
  separate(INDV, into = c("SP", "LIB", "SAMPLE_ID"), sep = "_", remove = FALSE, extra = "merge") %>%
  mutate(LIB = str_replace_all(LIB, "1A.*", "BMA1"),
         LIB = str_replace_all(LIB, "1B.*", "BMA1"),
         LIB = str_replace_all(LIB, "2A.*", "BMA2"),
         LIB = str_replace_all(LIB, "2B.*", "BMA2"),
         LIB = str_replace_all(LIB, "3A.*", "BMA3"),
         LIB = str_replace_all(LIB, "3B.*", "BMA3"))

gdepth <- left_join(gdepth, SampleInfo, by = "SAMPLE_ID")

```

Compare distribution of depth.

```{r genotype depth heatmap pop, fig.height=20, fig.width=20}

# would need to join with Sample Info to get pop specifications
ggplot(gdepth, aes(x = INDV, y = CHROM)) +
  geom_tile(aes(fill = GDEPTHBIN)) +
  scale_fill_viridis(direction = -1, option = "viridis",
                     name = 'genotype depth',
                     discrete = TRUE) +
  facet_grid(. ~ POP, space = "free", scales = "free", drop = TRUE) +
  labs(x = "individual", y = "locus") +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks.y = element_blank(),
        legend.position = "bottom")

```

Compare distribution of depth of individuals grouped by library.

```{r genotype depth heatmap lib, fig.height=20, fig.width=20}

ggplot(gdepth, aes(x = INDV, y = CHROM)) + 
  geom_tile(aes(fill = GDEPTHBIN)) + 
  scale_fill_viridis(direction = -1, option = "viridis",
                     name = 'genotype depth',
                     discrete = TRUE) +
  facet_grid(. ~ LIB, space = "free", scales = "free", drop = TRUE) +
  labs(x = "individual", y = "locus") +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks.y = element_blank(),
        legend.position = "bottom")

```

Compare distribution of genotype depths (across all individuals).

```{r distribution genotype depths, fig.height=3.5, fig.width=5.5, message=FALSE, warning=FALSE}

temp <- count(gdepth, GDEPTHBIN)

ggplot(temp, aes(x = GDEPTHBIN, y = n)) +
  geom_bar(stat= "identity", color = "black", fill = "darkorange") +
  theme_standard

```

Identify variance in depth w/in individuals.

```{r, fig.height=10, fig.width=9}

# calculate summary statistics
idepth <- gdepth %>%
  group_by(INDV) %>%
  summarize(TOTAL = sum(DEPTH),
            MAX = max(DEPTH),
            MEAN = mean(DEPTH),
            MEDIAN = median(DEPTH),
            VAR = var(DEPTH),
            STD = sd(DEPTH)) %>%
  mutate(COEFF_VAR = STD/MEAN,
         RATIO_MEAN_MEDIAN = MEAN/MEDIAN)

p1 <- ggplot(idepth, aes(x = MEAN, y = MEDIAN)) +
  geom_smooth(method = "auto",linetype = "dashed", size = 1, color = "darkred") +
  geom_point(shape = 1) +
  theme_standard

p2 <- ggplot(idepth, aes(x = MAX, y = TOTAL)) +
  geom_smooth(method = "lm",linetype = "dashed", size = 1, color = "darkred") +
  geom_point(shape = 1) +
  theme_standard

p3 <- ggplot(idepth, aes(x = TOTAL, y = VAR)) +
  geom_smooth(method = "auto",linetype = "dashed", size = 1, color = "darkred") +
  geom_point(shape = 1) +
  theme_standard

p4 <- ggplot(idepth, aes(MEDIAN)) +
  geom_histogram(binwidth = 10, color = "black", fill = "darkorange") +
  theme_standard

p5 <- ggplot(idepth, aes(RATIO_MEAN_MEDIAN)) +
  geom_histogram(binwidth = 0.1, color = "black", fill = "darkorange") +
  theme_standard

p6 <- ggplot(idepth, aes(COEFF_VAR)) +
  geom_histogram(binwidth = 0.05, color = "black", fill = "darkorange") +
  theme_standard

m14b <- multiplot(p1, p2, p3, p4, p5, p6, cols=2)

```

Compare depth, missing data and individual heterozygosity levels.

```{r istats, fig.height=18, fig.width=10, message=FALSE, warning=FALSE}

imiss <- read_table2("data/VCF/BMA.F11.imiss") %>%
  select(INDV, N_DATA, F_MISS)

istats <- left_join(idepth, imiss)

Fis <- read_table2("data/VCF/BMA.F11.het") %>%
  select(-N_SITES)

istats <- left_join(istats, Fis)

p1 <- ggplot(istats, aes(x = MEAN, y = F_MISS, fill = `F`)) +
  geom_point(shape = 21, size = 2, color = "black") +
  scale_fill_viridis(direction = 1, option = "viridis",
                     name = 'Fis', discrete = FALSE) +
  theme_standard

p2 <- ggplot(istats, aes(x = COEFF_VAR, y = MEAN, fill = F_MISS)) +
 geom_point(shape = 21, size = 2, color = "black") +
  scale_fill_viridis(direction = -1, option = "viridis",
                     name = '% missing', discrete = FALSE) +
  theme_standard

p3 <- ggplot(istats, aes(x = `O(HOM)`, y = MEAN, fill = F_MISS)) +
  geom_point(shape = 21, size = 2, color = "black") +
  scale_fill_viridis(direction = -1, option = "viridis",
                     name = 'mean read depth', discrete = FALSE) +
  theme_standard

p4 <- ggplot(istats, aes(x = MEAN)) +
  geom_histogram(binwidth = 10, color = "black", fill = "darkorange") +
  theme_standard

p5 <- ggplot(istats, aes(x = `F`, y = MEAN, fill = F_MISS)) +
  geom_point(shape = 21, size = 2, color = "black") +
  scale_fill_viridis(direction = -1, option = "viridis",
                     name = '% missing', discrete = FALSE) +
  theme_standard

p6 <- ggplot(istats, aes(x = `F`, y = COEFF_VAR, fill = F_MISS)) +
  geom_point(shape = 21, size = 2, color = "black") +
  scale_fill_viridis(direction = -1, option = "viridis",
                     name = '% missing', discrete = FALSE) +
  theme_standard

p7 <- ggplot(istats, aes(x = `F`, y = RATIO_MEAN_MEDIAN, fill = F_MISS)) +
  geom_point(shape = 21, size = 2, color = "black") +
  scale_fill_viridis(direction = -1, option = "viridis",
                     name = '% missing', discrete = FALSE) +
  scale_y_continuous(limits = c(0.8, 4)) +
  theme_standard

p8 <- ggplot(istats, aes(x = F_MISS)) +
  geom_histogram(binwidth = 0.01, color = "black", fill = "darkorange") +
  theme_standard


m14b <- multiplot(p1, p2, p3, p4, p5, p6, p7, p8, cols=2)

```

Flag low quality individuals

```{r}

# View(istats)

LQ_ind <- istats %>%
   filter(`F` > .9 | `F` < -.25) %>%
   select(INDV)

write.table(LQ_ind, "data/VCF/F12_LQ.indv",
            col.names = TRUE, row.names = FALSE, quote = FALSE)

```

### Filter 13: Missing data by sample location

```{bash missing by sample locat}

vcftools --vcf data/VCF/temp/BMA.F11.recode.vcf --out data/VCF/temp/BMA.F12 --remove data/VCF/F12_LQ.indv --recode --recode-INFO-all

# Campeche
vcftools --vcf data/VCF/temp/BMA.F12.recode.vcf --out data/VCF/temp/CAMP --keep data/VCF/CAMP.ind  --recode --recode-INFO-all
vcftools --vcf data/VCF/temp/CAMP.recode.vcf --out data/VCF/CAMP --missing-site

# Chandeleur Sound
vcftools --vcf data/VCF/temp/BMA.F12.recode.vcf --out data/VCF/temp/CS --keep data/VCF/CS.ind  --recode --recode-INFO-all
vcftools --vcf data/VCF/temp/CS.recode.vcf --out data/VCF/CS --missing-site

# Corpus Christi Bay
vcftools --vcf data/VCF/temp/BMA.F12.recode.vcf --out data/VCF/temp/CC --keep data/VCF/CC.ind  --recode --recode-INFO-all
vcftools --vcf data/VCF/temp/CC.recode.vcf --out data/VCF/CC --missing-site

# Florida Atlantic
vcftools --vcf data/VCF/temp/BMA.F12.recode.vcf --out data/VCF/temp/FLA --keep data/VCF/FLA.ind  --recode --recode-INFO-all
vcftools --vcf data/VCF/temp/FLA.recode.vcf --out data/VCF/FLA --missing-site

# Florida Gulf South
vcftools --vcf data/VCF/temp/BMA.F12.recode.vcf --out data/VCF/temp/FLGS --keep data/VCF/FLGS.ind  --recode --recode-INFO-all
vcftools --vcf data/VCF/temp/FLGS.recode.vcf --out data/VCF/FLGS --missing-site

# Florida Gulf North 
vcftools --vcf data/VCF/temp/BMA.F12.recode.vcf --out data/VCF/temp/FLGN --keep data/VCF/FLGN.ind  --recode --recode-INFO-all
vcftools --vcf data/VCF/temp/FLGN.recode.vcf --out data/VCF/FLGN --missing-site

# Louisiana
vcftools --vcf data/VCF/temp/BMA.F12.recode.vcf --out data/VCF/temp/LA --keep data/VCF/LA.ind  --recode --recode-INFO-all
vcftools --vcf data/VCF/temp/LA.recode.vcf --out data/VCF/LA --missing-site

# Mississippi Sound 
vcftools --vcf data/VCF/temp/BMA.F12.recode.vcf --out data/VCF/temp/MISS --keep data/VCF/MISS.ind  --recode --recode-INFO-all
vcftools --vcf data/VCF/temp/MISS.recode.vcf --out data/VCF/MISS --missing-site

# Mobile Bay
vcftools --vcf data/VCF/temp/BMA.F12.recode.vcf --out data/VCF/temp/MB --keep data/VCF/MB.ind  --recode --recode-INFO-all
vcftools --vcf data/VCF/temp/MB.recode.vcf --out data/VCF/MB --missing-site

```

Compare missing data per sample region:

```{r compare missing by locat, fig.height=12, fig.width=3, message=FALSE, warning=FALSE}

# pops to loop over
pop <- unique(Ind_RAW$POP)

loc_missing <- list()

# import missing data per locus
for (p in pop) {
  
  loc_missing[[p]] <- read_delim(paste("data/VCF/", p, ".lmiss", sep = ""), delim = "\t") %>%
    select(CHR, POS, F_MISS) %>%
    mutate(POP = p)
  
}

# create data frame with all information
loc_missing <- ldply(loc_missing, data.frame)

ggplot(loc_missing, aes(x = F_MISS)) +
  geom_histogram(binwidth = .05, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = 0.15),
             color = "darkred", linetype = "dashed", size = 1) +
  labs(x = "missing data per locus") +
  scale_y_sqrt() +
  facet_grid(POP ~ .) +
  theme_standard

```

identify loci with high missing data in each sample location (> 15 % missing data in a given location).

```{r}
# identify loci with high missing data in each region
SNPs <- filter(loc_missing, F_MISS > 0.15) 

count(SNPs, POP)

contigs <- SNPs %>%
  distinct(CHR)

SNPs <- SNPs %>%
  select(CHR, POS)

# Write contig/position to text file, use file with vcftools to remove positions from dataset 
write.table(SNPs, file = "data/VCF/LQ_F13.loci", 
            col.names= FALSE, row.names = FALSE, quote = FALSE)
```

Remove loci from data set:

```{bash}

vcftools --vcf data/VCF/temp/BMA.F12.recode.vcf --out data/VCF/temp/BMA.F13 --exclude-positions data/VCF/LQ_F13.loci --recode --recode-INFO-all

vcftools --vcf data/VCF/temp/BMA.F13.recode.vcf --out data/VCF/BMA.F13 --site-quality
vcftools --vcf data/VCF/temp/BMA.F13.recode.vcf --out data/VCF/BMA.F13 --depth
vcftools --vcf data/VCF/temp/BMA.F13.recode.vcf --out data/VCF/BMA.F13 --site-mean-depth
vcftools --vcf data/VCF/temp/BMA.F13.recode.vcf --out data/VCF/BMA.F13 --missing-indv
vcftools --vcf data/VCF/temp/BMA.F13.recode.vcf --out data/VCF/BMA.F13 --missing-site
vcftools --vcf data/VCF/temp/BMA.F13.recode.vcf --out data/VCF/BMA.F13 --het
vcftools --vcf data/VCF/temp/BMA.F13.recode.vcf --out data/VCF/BMA.F13 --geno-depth
vcftools --vcf data/VCF/temp/BMA.F13.recode.vcf --out data/VCF/BMA.F13 --singletons
vcftools --vcf data/VCF/temp/BMA.F13.recode.vcf --out data/VCF/BMA.F13 --indv-freq-burden
vcftools --vcf data/VCF/temp/BMA.F13.recode.vcf --out data/VCF/BMA.F13 --freq

```

Compare stats:

```{r stats F13, fig.height=20, fig.width=10, message=FALSE, warning=FALSE}

# load stats files ----
ind_stats_F13 <- read.ind.stats(dir = "data/VCF", vcf = "BMA.F13") %>%
  separate(INDV, into = c("SP", "LIB", "SAMPLE_ID"), sep = "_", remove = FALSE, extra = "merge")

loc_stats_F13 <- read.loc.stats(dir = "data/VCF", vcf = "BMA.F13")

# plot missing data per indv ----
p1 <- ggplot(ind_stats_F13, aes(x = MISS_BMA.F13)) +
  geom_histogram(binwidth = .01, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MISS_BMA.F13, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 0.25),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "missing data per indv") +
  theme_standard

# plot read depth per indv ----
p2 <- ggplot(ind_stats_F13, aes(x = MEAN_DEPTH_BMA.F13)) +
  geom_histogram(binwidth = 10, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH_BMA.F13, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean read depth per indv") +
  theme_standard

# plot depth vs missing ----
p3 <- ggplot(ind_stats_F13, aes(x = MEAN_DEPTH_BMA.F13, y = MISS_BMA.F13)) +
  geom_point() +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH_BMA.F13, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = mean(MISS_BMA.F13, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = 0.25),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean depth per indv", y = "% missing data") +
  theme_standard

# plot Fis per indv ----
p4 <- ggplot(ind_stats_F13, aes(x = Fis_BMA.F13)) +
  geom_histogram(binwidth = .01, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(Fis_BMA.F13, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 0),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "Fis per indv") +
  theme_standard

# plot Fis vs missing data per indv ----
p5 <- ggplot(ind_stats_F13, aes(x = Fis_BMA.F13, y = MISS_BMA.F13)) +
  geom_point() +
  geom_vline(aes(xintercept = mean(Fis_BMA.F13, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 0),
                 color = "darkblue", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = mean(MISS_BMA.F13, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = 0.25),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "Fis per indv", y = "% missing data") +
  theme_standard

# plot Fis vs mean depth per indv ----
p6 <- ggplot(ind_stats_F13, aes(x = Fis_BMA.F13, y = MEAN_DEPTH_BMA.F13)) +
  geom_point() +
  geom_vline(aes(xintercept = mean(Fis_BMA.F13, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 0),
                 color = "darkblue", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = mean(MEAN_DEPTH_BMA.F13, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "Fis per indv", y = "mean depth per indv") +
  theme_standard


# plot distribution missing data per locus ----
p7 <- ggplot(loc_stats_F13, aes(x = MISS_BMA.F13)) +
  geom_histogram(binwidth = 0.01, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MISS_BMA.F13, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 0.1),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "% missing data per locus") +
  theme_standard

# plot distribution mean read depth ----
p8 <- ggplot(loc_stats_F13, aes(x = MEAN_DEPTH_BMA.F13)) +
  geom_histogram(binwidth = 5, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH_BMA.F13, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean read depth per locus") +
  theme_standard

# plot read depth vs missing data ----
p9 <- ggplot(loc_stats_F13, aes(x = MEAN_DEPTH_BMA.F13, y = MISS_BMA.F13)) +
  geom_point() +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH_BMA.F13, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = mean(MISS_BMA.F13, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = 0.1),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean depth per locus", y = "% missing data") +
  theme_standard

# plot depth vs SNP quality ----
site_qual <- read.table("data/VCF/BMA.F13.lqual", 
                        header = TRUE, stringsAsFactors = FALSE) %>%
  mutate(PROB = 10^(-QUAL/10))

temp <- data.frame(loc_stats_F13$MEAN_DEPTH_BMA.F13, site_qual$QUAL) %>%
  rename(depth = loc_stats_F13.MEAN_DEPTH_BMA.F13, qual = site_qual.QUAL)

p10 <- ggplot(temp, aes(x = depth, y = qual)) +
  geom_point(size = 1) +
  geom_vline(aes(xintercept = mean(depth, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = mean(qual, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean depth per locus", y = "SNP quality") +
  theme_standard

# plot number of SNPs per contig vs. mean depth ----
temp <- loc_stats_F13 %>%
  count(CHR)

p11 <- left_join(temp, loc_stats_F13) %>%
  ggplot() +
  geom_point(aes(x = n, y = MEAN_DEPTH_BMA.F13)) +
  labs(x = "number of SNPs per contig", y = "mean depth") +
  theme_standard

# plot no of SNPs per locus ----
p12 <- loc_stats_F13 %>%
  count(CHR) %>%
  ggplot(aes(x = n)) +
  geom_histogram(binwidth = 1, color = "black", fill = "darkorange") + 
  labs(x = "number of SNPs per locus") +
  theme_standard

m15 <- multiplot(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, cols=2)

```

Compare number of contigs vs number of SNPs.

```{r}

# number of SNPs
nrow(loc_stats_F13)

# number of contigs in data set
contigs <- loc_stats_F13 %>%
  distinct(CHR)

nrow(contigs)

```

Mean number of SNPs per contig is `r nrow(loc_stats_F13)/nrow(contigs)`.

```{r}

istats <- left_join(ind_stats_raw, ind_stats_F13)

```


## Final thresholds values for filtered data set BMA:

* minimum sequence quality (minQ): 20
* minimum genotype quality (minGQ): 20
* minimum genotype call rate per locus (geno): 90%
* minimum genotype depth (minD): 5
* maximum missing data per individual (missInd): 25%
* mean minimum depth per locus (m-minD): 15
* mean maximum depth (m-minD): 300
* dDocent filters run for allelic balance, mapping quality, strandedness and paired status, depth/quality ratio
* Indels removed from data set
* minimum genotype call rate by population: 85%

Data set contains `r nrow(loc_stats_F13)` SNP sites on `r nrow(contigs)` and `r nrow(ind_stats_F13)` individuals.

# Haplotyping

## Prep for haplotyping

Create list of individuals retained in final vcf file:

Change so that new file is printed into Haplotyping/temp folder

```{bash}

vcftools --vcf data/VCF/temp/BMA.F13.recode.vcf --out data/Haplotyping/temp/BMA --recode --recode-INFO-all

vcfsamplenames data/Haplotyping/temp/BMA.recode.vcf > data/Haplotyping/temp/BMA.individuals

```

Use `BMA-BMA.individuals` to create `popmap` as a tab-separated file of individuals and their population designation, with one individual per line (make sure UNIX format). This file is needed to write the genepop file, if not provided the script will run through the process but not write a genepop file and place into same folder `rad_haplotyper.pl` will be run from.

```{r}

popmap <- read.table("data/Haplotyping/temp/BMA.individuals", 
                     col.names = "INDV", stringsAsFactors = FALSE) %>%
  separate(INDV, into = c("POP", "LIB", "ID"), sep = "_", remove = FALSE, extra = "merge") %>%
  select(INDV, POP)

write.table(popmap, "data/Haplotyping/temp/popmap", 
            col.names = FALSE, row.names = FALSE, quote = FALSE)

```

Place all necessary files in `data/Haplotyping/temp` directory (bam, fastq, reference.fasta, vcf-file)

```{bash, eval=FALSE, include=FALSE}

ln -s /home/soleary/GAFFTOPS/BMA_POPGEN/data/SEQ/BMA-1/*.fq.gz /home/soleary/GAFFTOPS/BMA_POPGEN/data/Haplotyping/temp
ln -s /home/soleary/GAFFTOPS/BMA_POPGEN/data/SEQ/BMA-1/*.bam* /home/soleary/GAFFTOPS/BMA_POPGEN/data/Haplotyping/temp
ln -s /home/soleary/GAFFTOPS/BMA_POPGEN/data/SEQ/BMA-2/*.fq.gz /home/soleary/GAFFTOPS/BMA_POPGEN/data/Haplotyping/temp
ln -s /home/soleary/GAFFTOPS/BMA_POPGEN/data/SEQ/BMA-2/*.bam* /home/soleary/GAFFTOPS/BMA_POPGEN/data/Haplotyping/temp
ln -s /home/soleary/GAFFTOPS/BMA_POPGEN/data/SEQ/BMA-3/*.fq.gz /home/soleary/GAFFTOPS/BMA_POPGEN/data/Haplotyping/temp
ln -s /home/soleary/GAFFTOPS/BMA_POPGEN/data/SEQ/BMA-3/*.bam* /home/soleary/GAFFTOPS/BMA_POPGEN/data/Haplotyping/temp

```

## Run Haplotyper

```{bash, eval=FALSE, include=FALSE}

# difficulty running from R
rad_haplotyper.pl -v BMA.recode.vcf -r reference.fasta -x 35 -d 10 -z 3 -m .25 -g BMA.gen -p popmap

```

Copy files for analysis into `data/Haplotyping/`-folder

```{bash}

cd /home/soleary/GAFFTOPS/BMA_POPGEN/data/Haplotyping/temp
rm *.fq.gz
rm *.bam*

cp /home/soleary/GAFFTOPS/BMA_POPGEN/data/Haplotyping/temp/ind_stats.out /home/soleary/GAFFTOPS/BMA_POPGEN/data/Haplotyping/

cp /home/soleary/GAFFTOPS/BMA_POPGEN/data/Haplotyping/temp/stats.out /home/soleary/GAFFTOPS/BMA_POPGEN/data/Haplotyping/

cp /home/soleary/GAFFTOPS/BMA_POPGEN/data/Haplotyping/temp/BMA.gen /home/soleary/GAFFTOPS/BMA_POPGEN/data/Haplotyping/

```

# Haplotype filtering

## Overview of haplotyping success

Comparison of the number of loci that were filtered due to excess number of missing data or suspected paralogs during the haplotyping process to those that passed. Haplotyer was run without any threshold values (other than default values). The genepop file only contains loci that passed haplotyping stage.

```{r import stats.out file, fig.height=3, fig.width=3, message=FALSE, warning=FALSE}

# import stats files generated by haplotyper
hap_stats <- read.hap.stats("data/Haplotyping/stats.out")



hap_ind_stats <- read.delim("data/Haplotyping/ind_stats.out",
                           stringsAsFactors = FALSE)

temp <- hap_ind_stats %>%
  rename(INDV = Ind)

istats <- left_join(istats, temp)

# view comparison of filtered vs. passed loci
plot.filter.status(hap_stats)

```

Remove filtered loci from data set.

```{r}

count(hap_stats, Status == "FILTERED")

hap_stats <- remove.filtered.haps(hap_stats)

```

`r nrow(hap_stats)` loci passed haplotyping process.

```{r plot stats.out file, fig.height=6, fig.width=9, message=FALSE, warning=FALSE}

plot.hap.stats(hap_stats)

```

Data set contains `r nrow(hap_ind_stats)` individuals:

```{r plot ind.stats.out file, fig.height=6, fig.width=9, message=FALSE, warning=FALSE}

plot.ind.hap.stats(hap_ind_stats)

```

## Identify threshold values to filter data set

### Proportion of individuals haplotyped

```{r prop indv haplotyped, fig.height=4, fig.width=13, message=FALSE, warning=FALSE}

# hap_stats <- read.hap.stats("data/Haplotyping/stats.out")

p1 <- ggplot(hap_stats, aes(x = Prop_Haplotyped)) +
  geom_histogram(binwidth = 0.025, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(Prop_Haplotyped, na.rm = TRUE)),
                 color = "darkred", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 0.9),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "proportion individuals haplotyped") +
  theme_standard

p2 <- ggplot(hap_stats, aes(x = Prop_Haplotyped, y = Poss_Paralog)) +
  geom_point(shape = 21, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(Prop_Haplotyped, na.rm = TRUE)),
                 color = "darkred", linetype = "dashed", size = 1) +
  labs(x = "proportion individuals haplotyped", y = "possible paralogs") +
  theme_standard

p3 <- ggplot(hap_stats, aes(x = Prop_Haplotyped, y = `Low_Cov.Geno_Err`)) +
  geom_point(shape = 21, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(Prop_Haplotyped, na.rm = TRUE)),
                 color = "darkred", linetype = "dashed", size = 1) +
  labs(x = "proportion individuals haplotyped", y = "low coverage error") +
  theme_standard

multiplot(p1, p2, p3, cols = 3)

```

Flag loci successfully haplotyped in < 90% of individuals.

```{r filter loci genotype call rate, message=FALSE, warning=FALSE}

# number of loci called in >95% individuals
count(hap_stats, Prop_Haplotyped < 0.9)

# create vector of loci to remove (choose cut-off)
Prop_Haplotyped <- filter.loci.prop_haplotyped(hap_stats, 0.9)

```

`r length(Prop_Haplotyped)` loci were flagged as genotype call rate < 0.9.

### Possible paralogs per locus

Loci are flagged as possible paralogs for an individuals when more than the expected number of haplotypes based on the SNP genotype call (homozygote, heterozygote) are detected.

```{r poss paralogs per locus, fig.height=3, fig.width=4, message=FALSE, warning=FALSE}

ggplot(hap_stats, aes(x = Poss_Paralog)) +
  geom_histogram(binwidth = 2, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(Poss_Paralog, na.rm = TRUE)),
                 color = "darkred", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = (nrow(hap_ind_stats)*0.01)),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "possible paralogs per locus") +
  theme_standard

```

1 % of individuals: `r nrow(hap_ind_stats)*0.01` 
5 % of individuals: `r nrow(hap_ind_stats)*0.05` 

Flag loci that are flagged as potential paralogs in 5 or more individuals.
4
```{r filter loci paralogs}

# number of loci with Poss_Paralogs
count(hap_stats, Poss_Paralog > 4)

# create vector of loci to remove (choose cut-off)
Poss_Paralogs <- filter.loci.paralogs(hap_stats, 4)

```

`r length(Poss_Paralogs)` loci were flagged as possible paralogs.

### Number of SNPs & Haplotypes per locus

Each locus varies in the number of SNPs detected which determines the number of haplotypes expected in that population. 

```{r no SNPs, fig.height=3, fig.width=4, message=FALSE, warning=FALSE}

ggplot(hap_stats, aes(x = Sites)) +
  geom_histogram(binwidth = 1, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(Sites, na.rm = TRUE)),
                 color = "darkred", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 25),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "number of SNPs per contig") +
  theme_standard

```

Filtering loci based on number of SNPs contained on that locus could bias the data set as loci with high recombination may be removed. On the other hand, assuming an approximate length of 250 bp loci with more than `r 250*0.1` SNPs would mean that 10% of bases are a polymorphisms.

```{r sites vs paralogs, fig.height=3, fig.width=8}

p1 <- ggplot(hap_stats, aes(x = Haplotypes, y = Poss_Paralog)) +
  geom_point(shape = 21, color = "black", fill = "darkorange") +
  geom_smooth(method = "lm") +
  labs(x = "no. haplotypes", y = "possible paralogs") +
  theme_standard

p2 <- ggplot(hap_stats, aes(x = Sites, y = Poss_Paralog)) +
  geom_point(shape = 21, color = "black", fill = "darkorange") +
  geom_smooth(method = "lm") +
  labs(x = "no. of sites", y = " ") +
  theme_standard
  
multiplot(p1, p2, cols = 2)

```

Identify number of haplotypes loci with > 35 SNP sites.

```{r filter no of SNPs}

count(hap_stats, Sites > 35)

NoSNps <- filter.loci.noSNPs(hap_stats, 35)

```

`r length(NoSNps)` loci were flagged.

Assuming that mutation is the only mechanism resulting in new haplotypes, the maximum expected number of haplotypes per locus is number of SNPs N + 1.

```{r xtra haplotypes, fig.height=12, fig.width=10, message=FALSE, warning=FALSE}

p1 <- ggplot(hap_stats, aes(x = Haplotypes)) +
  geom_histogram(binwidth = 1, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(Haplotypes, na.rm = TRUE)),
                 color = "darkred", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = quantile(Haplotypes, .99, na.rm = TRUE)),
                 color = "darkred", linetype = "dashed", size = 1) +
  labs(x = "# haplotypes per locus", y = "# loci") +
  theme_standard

temp <- hap_stats %>%
  select(Locus, Sites, Haplotypes, Prop_Haplotyped, Low_Cov.Geno_Err, Poss_Paralog) %>%
  mutate(exp_sites = Sites + 1) %>%
  mutate(xtra = Haplotypes - Sites)

p2 <- ggplot(temp, aes(x = Sites, y = Poss_Paralog)) +
  geom_point() +
  geom_hline(yintercept = 5, color = "darkblue", linetype = "dashed", size = 1) +
  geom_vline(xintercept = 25, color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "# SNP sites per locus", y = "possible paralogs") +
  theme_standard

p3 <- ggplot(temp, aes(x = Sites, y = xtra)) +
  geom_point() +
  geom_hline(yintercept = 10, color = "darkblue", linetype = "dashed", size = 1) +
  geom_vline(xintercept = 25, color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "# SNP sites per locus", y = "# extra haplotypes") +
  theme_standard

p4 <- ggplot(temp, aes(x = xtra, y = Prop_Haplotyped)) +
  geom_point(size = 1) +
  geom_hline(yintercept = 0.9, color = "darkblue", linetype = "dashed", size = 1) +
  geom_vline(xintercept = 25, color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "# extra haplotypes", y = "proportion of indv haplotyped") +
  theme_standard

p5 <- ggplot(temp, aes(x = xtra, y = Low_Cov.Geno_Err)) +
  geom_point(size = 1) +
  geom_hline(yintercept = 5, color = "darkblue", linetype = "dashed", size = 1) +
  geom_vline(xintercept = 25, color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "# extra haplotypes", y = "potential genotyping error") +
  theme_standard

p6 <- ggplot(temp, aes(x = xtra, y = Poss_Paralog)) +
  geom_point(size = 1) +
  geom_hline(yintercept = 5, color = "darkblue", linetype = "dashed", size = 1) +
  geom_vline(xintercept = 25, color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "# extra haplotypes", y = "possible paralogs") +
  theme_standard

m21 <- multiplot(p1, p2, p3, p4, p5, p6, cols = 2)

```

Again, removing loci with unexpectedly high numbers of haplotypes may bias the data set, as loci with e.g. high recombination may be removed from the data set.

```{r filter no of haplotypes}

count(temp, xtra >= 50)

# create vector of loci to remove
NoHaps <- filter(temp, xtra >= 50) 

NoHaps <- NoHaps$Locus

```

`r length(NoHaps)` loci were flagged for excessive number of haplotypes compared to the number of SNPs at that locus.

### Low Coverage/Genotyping Error

After combining SNPs on the same contig during the haplotyping process, it is possible to observe fewer than the expected number of haplotypes based on the genotype call (heterozygote/homozygote). When this occurs, that genotype is coded as missing. For each locus the number of individuals for which is it flagged as a potential a potential genotyping error or suffering from null alleles due to low coverage detected for a given locus is recorded. 

```{r low cov genotypg error, fig.height=4, fig.width=5, message=FALSE, warning=FALSE}

ggplot(hap_stats, aes(x = Low_Cov.Geno_Err)) +
  geom_histogram(binwidth = 5, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(Low_Cov.Geno_Err, na.rm = TRUE)),
                 color = "darkred", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = (nrow(hap_ind_stats)*0.05)),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "potential genotyping error") +
  theme_standard

```

Loci with pronounced patterns of genotyping error likely due to low coverage will have low success rate for genotyping, i.e. they will be caught in missing data filters. Coverage issues are likely genotype specfic; previous filters have targeted loci and individuals with high variance in coverage and suspect genotpes have been coded as missing, i.e. this filter need not be very conservative, regardless, loci that are repeatedly being flagged as problematic should be removed.

```{r filter low coverage/genotyping error, message=FALSE, warning=FALSE}

# summary of count of possible genotyping errors
count(hap_stats, Low_Cov.Geno_Err > 10)

# create vector of loci to remove
Geno_Err <- filter.loci.geno_err(hap_stats, 10)

```

`r length(Geno_Err)` loci were flagged as potentially affected by genotyping error.

### Flag problematic individuals

Loci that are not successfully haplotyped in an individual due to missing data, complex locus, haplotyped genotype is higher/lower than SNP haplotype in a given individual are coded as missing for that individual. Problematic individual can be identified as having a high proportion of missing data.

**Individuals with low proportion of successfully haplotyped loci**

```{r missing indv, fig.height=4, fig.width=5, message=FALSE, warning=FALSE}

ggplot(hap_ind_stats, aes(x = Prop_Success)) +
  geom_histogram(binwidth = 0.01, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(Prop_Success, na.rm = TRUE)),
                 color = "darkred", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 0.75),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "proportion loci successfully haplotyped") +
  theme_standard

```

Problem individuals can be identified by choosing a cut-off value for the minium proportion of sucessfully haplotyped loci and excluding individuals below that threshold value. Removing loci with low haplotyping success with decrease the amount of missing data. Therefore, final missing data cut-offs should be applied *after* removing those loci from the data set, though it is important to flag potentially problematic individuals based on their haplotyping stats at this point.

```{r filter success haplotyped indv}

# count individuals above set threshold
count(hap_ind_stats, Prop_Success < .75)

# create vector of indv to remove (choose cut-off)
PropSuccessInd <- filter.ind.prop_success(hap_ind_stats, .75)

write.table(PropSuccessInd, "data/POPGEN/propSucc.indv", 
            col.names = TRUE, row.names = FALSE, quote = FALSE, sep = "\t")

# View(as.data.frame(PropSuccessInd))

```

**Possible paralogs per individual**

Individuals with high proportion of loci flagged as possible paralogs should be flagged as potential problem individuals. 

```{r poss paralogs per indv, fig.height=4, fig.width=5, message=FALSE, warning=FALSE}

ggplot(hap_ind_stats, aes(x = Poss_Paralogs)) +
  geom_histogram(binwidth = 10, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(Poss_Paralogs, na.rm = TRUE)),
                 color = "darkred", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = (nrow(hap_stats)*0.01)),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "no of loci potential paralogs") +
  theme_standard

```

Cut-off for individuals with loci flagged paralogs in > 1% of loci is `r nrow(hap_stats)*0.01`.

```{r filter paralogs indv}

# count individuals above set threshold
count(hap_ind_stats, Poss_Paralogs > 100)

# create vector of indv to remove (choose cut-off)
Poss_ParalogInd <- filter.ind.paralogs(hap_ind_stats, 100)

write.table(Poss_ParalogInd, "data/POPGEN/possParal.indv", 
            col.names = TRUE, row.names = FALSE, quote = FALSE, sep = "\t")

```

The highest number of flagged loci in an individuals is `r max(hap_ind_stats$Poss_Paralogs, na.rm = TRUE)`, which is equivalent to `r round(max(hap_ind_stats$Poss_Paralogs, na.rm = TRUE)/nrow(hap_stats), digits = 4)*100`% of loci.

**Individuals with high proportion of potential allele dropout/genotyping error**

```{r genotypg error per indv, fig.height=3, fig.width=4, message=FALSE, warning=FALSE}

ggplot(hap_ind_stats, aes(x = Low_Coverage.Errors)) +
  geom_histogram(binwidth = 5, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(Low_Coverage.Errors, na.rm = TRUE)),
                 color = "darkred", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = (nrow(hap_stats)*0.01)),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "# loci potential genotyping error", y = "# indv") +
  theme_standard

```

Remove individuals with high proportion of loci that have been flagged as potential allele dropouts/genotyping error.

```{r filter genotyping error indv}

# count individuals above set threshold
count(hap_ind_stats, Low_Coverage.Errors > 285)

# create vector of indv to remove (choose cut-off)
GenoErrIndv <- filter.ind.geno_err(hap_ind_stats, 285)

write.table(GenoErrIndv, "data/POPGEN/GenoErr.indv", 
            col.names = TRUE, row.names = FALSE, quote = FALSE, sep = "\t")

# View(GenoErrIndv)

```

The highest number of flagged loci in an individuals is `r max(hap_ind_stats$Low_Coverage.Errors, na.rm = TRUE)`, which is equivalent to `r round(max(hap_ind_stats$Low_Coverage.Errors, na.rm = TRUE)/nrow(hap_stats), digits = 4)*100`% of loci.

## Filter flagged loci and individuals as necessary

Load genepop file with haplotyped loci.

```{r load genepop, message=FALSE, warning=FALSE}

# import genepop file
gen <- read.genepop(file = "data/Haplotyping/BMA.gen",
                    ncode = 3L, quiet = FALSE)

gen

```

Remove flagged loci and individuals.

```{r remove loci individuals, message=FALSE, warning=FALSE}

# remove flagged loci
removeloc <- c(Poss_Paralogs, NoSNps, NoHaps, Geno_Err, Prop_Haplotyped)

gen <- genind.rem.loci(gen, removeloc)

# remove flagged individuals
removeInd <- c(Poss_ParalogInd, PropSuccessInd, GenoErrIndv,
               "BMA_1A-H02_MS18", "BMA_1B-C05_MS-22", "BMA_1B-C03_MS-20", "BMA_3A-D01_BM006")
gen <- gen.ind.rem.Ind(gen, removeInd)

gen

```

# Compare duplicate individuals

```{r comp dulplicate indv, message=FALSE, warning=FALSE}

# extract genotype matrix
df <- genind2df(gen,
                usepop = TRUE,
                sep = ":", oneColPerAll = FALSE) %>%
  select(-pop) %>%
  rownames_to_column()

# create list of duplicates
pairs <- list()

# input each set of duplicates as a vector of a list
pairs[[1]] <- c("BMA_3B-E12_MS-37", "BMA_2A-C01_MS-57")
pairs[[2]] <- c("BMA_1A-C01_CCGN-001", "BMA_1A-D02_CCGN01")
pairs[[3]] <- c("BMA_1A-H04_CC02-35",	"BMA_2B-B09_CC-35")
pairs[[4]] <- c("BMA_3B-H02_MS26", "BMA_3B-D01_MS26")
pairs[[5]] <- c("BMA_2B-H02_MB06", "BMA_1B-E06_MB-06")
pairs[[6]] <- c("BMA_1B-G11_CC-08", "BMA_1A-H03_CC02-08")
pairs[[7]] <- c("BMA_3B-H03_BM027", "BMA_3B-A07_BM027")
pairs[[8]] <- c("BMA_1B-C04_MS-21",	"BMA_2A-C05_MS-61")
pairs[[9]] <- c("BMA_2B-H01_CS3", "BMA_1A-B03_CS-03")
pairs[[10]] <- c("BMA_3B-E11_MS-36", "BMA_1B-C12_MS-56")
pairs[[11]] <- c("BMA_2A-C07_MS-63", "BMA_1B-C06_MS-23")
pairs[[12]] <- c("BMA_2B-B01_CC-12", "BMA_3A-A02_CC-12")
pairs[[13]] <- c("BMA_2B-G05_BMAR-03", "BMA_3B-E12_BMAR-02")


# create empty list for genotype error (discordant loci)
genoerror <- list()

# create empty vector for duplicates (retains first indv in pair)
dup <- character()

# identify discordant genotypes for each set of duplicates
for (p in pairs){

# select duplicates from genotype matrix
geno <- df %>%
  filter(rowname %in% p) %>%
  select(-rowname)

# compare genotypes
comp <- (t(geno))

contigs <- as.data.frame(comp) %>%
  rownames_to_column(var = "LOCUS") %>%
  mutate(V1 = as.character(V1),
         V2 = as.character(V2)) %>%
  filter(V1 != V2)

# write vector with first individual in pair
ind <- p[1]

dup <- c(dup, ind)

genoerror[[ind]] <- contigs

}

# if it throws error object V1 or V2 not found it means one or more of the samples names are not correct or no longer in data set after filtering

# combine into one dataframe
genoerror <- ldply(genoerror, data.frame) %>%
  rename(DUP1 = `.id`,
         GENO_INDV1 = V1,
         GENO_INDV2 = V2)

write_delim(genoerror, "results/haplotyped.genoerror", delim = "\t")

```

Compare genotype depth for discordant loci:

```{r message=FALSE, warning=FALSE}

# import genotype depth keep one depth per contig
n <- nrow(ind_stats_F13) + 1

gdepth <- read_table2("data/VCF/BMA.F13.gdepth") %>%
  select(-POS) %>%
  distinct(CHROM, .keep_all = TRUE) %>%
  gather(key = INDV, value = GDEPTH, 2:n) %>%
  rename(LOCUS = CHROM)

# create dataframe with duplicates
dup_indv <- data.frame(matrix(ncol = 2, nrow = 0))
colnames(dup_indv) <- c("DUP1", "DUP2")

for (p in pairs){

p <- as.data.frame(t(p))

colnames(p) <-  c("DUP1", "DUP2")

dup_indv <- bind_rows(dup_indv, p)
   
}

# add duplicate 2 to genoerror data frame
genoerror <- left_join(genoerror, dup_indv)

# add genotype depth information for duplicate 1
gdepth <- gdepth %>%
  rename(DUP1 = INDV)

genoerror <- left_join(genoerror, gdepth) %>%
  rename(GDEPTH1 = GDEPTH)

# add genotype depth information for duplicate 2
gdepth <- gdepth %>%
  rename(DUP2 = DUP1)

genoerror <- left_join(genoerror, gdepth) %>%
  rename(GDEPTH2 = GDEPTH)

# identify homozygote vs heterozygote genotypes
genoerror <- genoerror %>%
  separate(col = GENO_INDV1, into = c("GENO1a", "GENO1b"), sep = ":", remove = FALSE) %>%
  separate(col = GENO_INDV2, into = c("GENO2a", "GENO2b"), sep = ":", remove = FALSE) %>%
  mutate(GENOTYPE1 = ifelse(GENO1a == GENO1b, "HOMOZYGOTE", "HETEROZYGOTE"),
         GENOTYPE2 = ifelse(GENO2a == GENO2b, "HOMOZYGOTE", "HETEROZYGOTE"))

```

Determine number of times discordant genotype is due to heterozygote/homozygote.

```{r}

count(genoerror, GENOTYPE1 == GENOTYPE2)

```

Compare number of discordant genotype calls per duplicate set.

```{r, fig.height=3, fig.width=4}

total <- as.numeric(length(locNames(gen)))

# compar number of loci by pair
per_ind <- count(genoerror, DUP1) %>%
  mutate(freq = n/total*100)

ggplot(per_ind, aes(freq)) +
  geom_histogram(binwidth = 0.25, fill = "darkorange", color = "black") +
  scale_x_continuous(limits = c(0, 5)) +
  labs(x = "% genotyping error per indv") +
  theme_standard

```

Identify loci consistently affected by genotyping error

```{r fig.height=3, fig.width=4, message=FALSE, warning=FALSE}

total <- as.numeric(length(pairs))

# compare loci affected by genotyping error
per_loc <- count(genoerror, LOCUS) %>%
  mutate(freq = n/total*100) %>%
  rename(Locus = LOCUS)

per_loc <- left_join(per_loc, hap_stats)

ggplot(per_loc, aes(x = n)) +
  geom_histogram(binwidth = 1, fill = "darkorange", color = "black") +
  geom_vline(aes(xintercept = mean(n, na.rm = TRUE)),
             color = "darkred", linetype = "dashed", size = 1) +
  labs(x = "same locus affected in n indv") +
  theme_standard

```

```{r}

count(per_loc, n > 3)

```

Remove flagged loci and one individual per pair.

```{r}

# write vector with second individual in pair (lower quality)
rm <- c("BMA_2A-C01_MS-57", "BMA_1A-D02_CCGN01", "BMA_2B-B09_CC-35", "BMA_3B-D01_MS26", "BMA_1B-E06_MB-06",
        "BMA_1A-H03_CC02-08", "BMA_3B-A07_BM027", "BMA_2A-C05_MS-61", "BMA_1A-B03_CS-03", "BMA_1B-C12_MS-56",
        "BMA_1B-C06_MS-23", "BMA_3A-A02_CC-12", "BMA_3B-E12_BMAR-02")

# remove duplicates
removeInd <- rm

gen <- gen.ind.rem.Ind(gen, removeInd)

# remove flagged loci
removeloc <- filter(per_loc, n > 3)
gen <- genind.rem.loci(gen, removeloc)

gen

```

`r nrow(temp)` loci were removed as potentially affected by genotyping error.

# Relatedness & Inbreeding

Create input file fore `related` package.

```{r}

# need to change to R 32
library(related)

# create input file ----
df <- genind2df(gen, usepop = FALSE, oneColPerAll = TRUE) %>%
  rownames_to_column("INDV") %>%
  separate(INDV, into = c("POP", "LIB", "INDV"), sep = "_", extra = "merge") %>%
  select(-POP) %>%
  unite(IND, LIB, INDV, sep = "_")

# missing data must be 0
df[df=="NA"] <- 0

write.table(df, "scratch/related.input",
            row.names = FALSE, col.names = FALSE, sep = "\t", quote = FALSE)

# import input file as list (gdata, nloci, nalleles, ninds, freqs)
genotypedata <- readgenotypedata("scratch/related.input")

```

Calculate pairwise relatedness according to Lynch & Ritland 1999; uses inbreeding estimates for each individuals to estimate relatedness.

```{r lynchrd, fig.height=8, fig.width=7}

# pairwise relatedness (Lynch & Ritland 1999)
relatedness_lynchrd <- coancestry(genotypedata$gdata,
                                  lynchrd = 1)

relatedn <- relatedness_lynchrd$relatedness %>%
  select(pair.no, ind1.id, ind2.id, lynchrd)

write_delim(relatedn, "results/pairwise_relatedness")

inbreed <- relatedness_lynchrd$inbreeding %>%
  select(ind.id, L3, LH) %>%
  rename(INDV = ind.id)

write_delim(inbreed, "results/inbreeding")

```

Distribution of levels of pairwise relatedness:

```{r lynchrd distrib relatedn, fig.height=3, fig.width=8}

ggplot(relatedn, aes(x = lynchrd)) +
  geom_histogram(binwidth = 0.0025, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(lynchrd, na.rm = TRUE)),
             color = "darkblue", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = quantile(lynchrd, 0.95)),
             color = "darkred", linetype = "dashed", size = 1) +
  scale_y_sqrt() +
  labs(x = "relatedness", y = "number of pairs") +
  theme_standard

```

Distribution of levels of homozygosity:

```{r lynchrd inbreeding, fig.height=3, fig.width=4}

ggplot(inbreed, aes(x = L3)) +
  geom_histogram(binwidth = 0.005, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(LH, na.rm = TRUE)),
             color = "darkblue", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = quantile(LH, 0.95)),
             color = "darkred", linetype = "dashed", size = 1) +
  labs(x = "inbreeding coefficient", y = "individuals") +
  theme_standard

```

# Heterozygosity and HWE

Generate summary statistics

```{r, message=FALSE, warning=FALSE}

SampleInfo <- read_delim("data/POPGEN/SampleInfo.txt", delim = "\t") %>%
  distinct(SAMPLE_ID, .keep_all = TRUE)

Inds <- as.data.frame(indNames(gen)) %>%
  rename(LIB_ID = `indNames(gen)`) %>%
  separate(LIB_ID, into = c("SP", "WELL", "SAMPLE_ID"), sep = "_", extra = "merge", remove = FALSE)

strata <- left_join(Inds, SampleInfo) %>%
  distinct() %>%
  mutate(POP = ordered(POP, levels = pops)) %>%
  mutate(REGION = case_when(POP %in% c("FLA") ~ "SWATL",
                            POP %in% c("CAMP") ~ "SGULF",
                            POP %in% c("FLGS", "FLGN") ~ "EGULF",
                            POP %in% c("MB", "MISS", "CS", "LA") ~ "CGULF",
                            POP %in% c("CC") ~ "WGULF",),
         OCEAN = ifelse(POP == "FLA", "ATL", "GULF"))

strata(gen) <- strata

# define groups using strata information - need to add POP for missing ones
setPop(gen) <- ~POP

# generate genetic diversity stats
GenDiv <- adegenet::summary(gen)

dat <- hierfstat:::.genind2hierfstat(gen)
GenDiv2 <- basic.stats(dat)

```

Compare observed (Ho) and expected (He) heterozygosity for all individuals across all populations.

```{r}

# observed heterozygosity per locus
Ho <- as.data.frame(GenDiv$Hobs) %>% 
  rownames_to_column("LOCUS") %>%
  rename(Ho = `GenDiv$Hobs`)

# expected heterozygosity per locus
Hs <- as.data.frame(GenDiv$Hexp) %>% 
  rownames_to_column("LOCUS") %>%
  rename(Hexp = `GenDiv$Hexp`)

# expected and observed heterozysity per locus
Het <- left_join(Ho, Hs, by = "LOCUS")

```

Identify loci that are now monomorphic and remove from data set:

```{r}

monomorphic <- filter(Ho, Ho == 0)

monomorphic <- monomorphic$LOCUS

# remove flagged loci
removeloc <- c(monomorphic)
gen <- genind.rem.loci(gen, removeloc)

gen

```

Expected vs observed heterozygosity across all populations.

```{r fig.height=4, fig.width=4}

temp <- Het %>%
  filter(Ho != 0)

# plot Ho vs Hs across all individuals
ggplot(temp, aes(x = Ho, y = Hexp)) +
  geom_point() +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed", size = 1) +
  xlim(0, 1) +
  ylim(0, 1) +
  labs(x = "observed heterozygosity Ho", y = "within cluster diversity Hs (Hexp)") +
  theme_standard

```

Compare distribution of observed heterozygosity (Ho) in each putative population:

```{r fig.height=4, fig.width=4}

Ho_per <- as.data.frame(GenDiv2$Ho) %>%
  rownames_to_column("LOCUS") %>%
  rename(CAMP = `1`, FLGS = `2`, FLGN = `3`, FLA = `4`, CS = `5`, 
         CC = `6`, MISS = `7`, LA = `8`, MB = `9`) %>%
  select(LOCUS, CAMP, FLGS, FLGN, FLA, CS, CC, MISS, LA, MB) %>%
  gather(Group, Ho, 2:10) %>%
  mutate(POP = ordered(Group, levels = pops)) %>%
  mutate(REGION = case_when(POP %in% c("FLA") ~ "SWATL",
                            POP %in% c("CAMP") ~ "SGULF",
                            POP %in% c("FLGS", "FLGN") ~ "EGULF",
                            POP %in% c("MB", "MISS", "CS", "LA") ~ "CGULF",
                            POP %in% c("CC") ~ "WGULF",),
         OCEAN = ifelse(POP == "FLA", "ATL", "GULF"))

# compare observed heterozygosity per locus and cluster
ggplot(Ho_per, aes(x = Group, y = Ho, fill = REGION)) +
  geom_boxplot() +
  labs(x = "Group", y = "Ho") +
  scale_fill_manual(values = col_regs) +
  theme_standard +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

```

Compare distribution of expected heterozygosity He, (within cluster diversity) in each putative population:

```{r fig.height=4, fig.width=5}

# within cluster diversity/exp diversity per locus (rows) and cluster (columns)
Hs_per <- as.data.frame(GenDiv2$Hs) %>%
  rownames_to_column("LOCUS") %>%
  rename(CAMP = `1`, FLGS = `2`, FLGN = `3`, FLA = `4`, CS = `5`, 
         CC = `6`, MISS = `7`, LA = `8`, MB = `9`) %>%
  select(LOCUS, CAMP, FLGS, FLGN, FLA, CS, CC, MISS, LA, MB) %>%
  gather(Group, Hs, 2:9)

# compare expected heterozygosity per locus and cluster
ggplot(Hs_per, aes(x = Group, y = Hs)) +
  geom_boxplot() +
  labs(x = "Group", y = "expected heterozygosity Ho") +
  theme_standard

```

Comparison of oberved vs. heterozygosity within each population:

```{r, fig.height=8, fig.width=12, message=FALSE, warning=FALSE}

# join data frames
Het_per <- left_join(Hs_per, Ho_per)

# plot per cluster
ggplot(Het_per, aes(x = Ho, y = Hs)) +
  geom_point(shape = 19, size = 1) +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed", size = 1) +
  xlim(0, 1) +
  ylim(0, 1) +
  facet_grid(. ~ Group) +
  labs(x = "observed heterozygosity Ho", y = "within cluster diversity Hs (Hexp)") +
  theme_standard

```

Test for deviations from Hardy-Weinberg equilibrium within each group (POP level):

```{r}

setPop(gen) <- ~POP

popNames(gen)

# should be able to use lapply
HWE <- seppop(gen) %>%
   lapply(hw.test, B=1000)

HWE[["ALL"]] <- hw.test(gen, B = 1000)

# convert to data.frames
hwe <- list()

for (m in names(HWE)) {
   hwe[[m]] <- as.data.frame(HWE[[m]]) %>%
   rename(pval = Pr.exact) %>%
   rownames_to_column("LOCUS") %>%
   select(LOCUS, pval)
}

hwe <- ldply(hwe, data.frame) %>%
   rename(POP = `.id`) %>%
   mutate(HWE = ifelse(pval <= 0.05, "OUT", "IN"))

write_delim(hwe, "results/HWEbypop", delim = "\t")

hwe <- read_delim("results/HWEbypop", delim = "\t")

hwe_perpop <- hwe %>%
  group_by(POP) %>%
  count(HWE)

hwe_perloc <- hwe %>%
  filter(POP != "ALL") %>%
  group_by(LOCUS) %>%
  count(HWE) %>%
  filter(HWE == "OUT")

```

Format output and view results:

```{r HWE results, fig.height=3, fig.width=12, message=FALSE, warning=FALSE}

# plot data
ggplot(hwe_perpop, aes(x = HWE, y = n)) +
  geom_bar(stat = "identity", color = "black", fill = "darkorange") +
  labs(x = "significant deviation from HWE (p > 0.05)", y = "number of loci") +
  facet_grid( ~ POP) +
  theme_standard

```

Distribution of number of times loci are out of HWE

```{r fig.height=3, fig.width=4}

ggplot(hwe_perloc, aes(x = n)) +
  geom_histogram(binwidth = 1, color = "black", fill = "darkorange") +
  labs(x = "no populations out of HWE", y = "no. loci") +
  theme_standard

```

Remove loci out of HWE in 5 or more populations:

```{r}

# remove flagged loci
removeloc <- hwe_perloc %>%
  filter(n >= 5)

gen <- genind.rem.loci(gen, removeloc$LOCUS)

gen

```

# Final data set

Write files with filtered data set:

```{r write filtered data set}

library(radiator)

setPop(gen) <- ~POP

# convert to tidy data set
tidy <- tidy_genomic_data(data = gen, filename = NULL) %>%
  dplyr::select(1:6)

# write tidy data set to file
write_delim(tidy, "data/POPGEN/BMA.tidy.genotypyes", delim = "\t")

# write genepop file
write_genepop(data = tidy,
              filename = "data/POPGEN/BMA_by_pop",
              genepop.header = "gafftop sail catfish grouped by population")

```

Write files with individuals and Contigs still contained in data set and use to filter vcf file.

```{r}

keeploci <- as.data.frame(locNames(gen)) %>%
  rename(LOCUS = `locNames(gen)`)

keeploci <- filter(loc_stats_F13, CHR %in% keeploci$LOCUS) %>%
  select(CHR, POS)

write.table(keeploci, "data/VCF/filtered.loci",
            col.names = FALSE, row.names = FALSE, quote = FALSE)

keepind <- as.data.frame(indNames(gen)) %>%
  rename(LIB_ID = `indNames(gen)`) 

keepind <-filter(ind_stats_F13, INDV %in% keepind$LIB_ID) %>%
  select(INDV)

write.table(keepind, "data/VCF/filtered.ind",
            col.names = FALSE, row.names = FALSE, quote = FALSE)

```

Write vcf file and 012 format:

```{bash}

vcftools --vcf data/VCF/temp/BMA.F13.recode.vcf --out data/VCF/temp/BMA --positions data/VCF/filtered.loci --keep data/VCF/filtered.ind --recode --recode-INFO-all

vcftools --vcf data/VCF/temp/BMA.recode.vcf --out data/POPGEN/BMA --012

vcftools --vcf data/VCF/temp/BMA.recode.vcf --out data/VCF/BMA.fil --depth
vcftools --vcf data/VCF/temp/BMA.recode.vcf --out data/VCF/BMA.fil --site-mean-depth
vcftools --vcf data/VCF/temp/BMA.recode.vcf --out data/VCF/BMA.fil --missing-indv
vcftools --vcf data/VCF/temp/BMA.recode.vcf --out data/VCF/BMA.fil --missing-site
vcftools --vcf data/VCF/temp/BMA.recode.vcf --out data/VCF/BMA.fil --het
vcftools --vcf data/VCF/temp/BMA.recode.vcf --out data/VCF/BMA.fil --hardy
vcftools --vcf data/VCF/temp/BMA.recode.vcf --out data/VCF/BMA.fil --site-quality
vcftools --vcf data/VCF/temp/BMA.recode.vcf --out data/VCF/BMA.fil --geno-depth

```

Compare stats for final filtered data set:
    
```{r stats final, fig.height=20, fig.width=10, message=FALSE, warning=FALSE}

# load stats files ----
ind_stats_fil <- read.ind.stats(dir = "data/VCF", vcf = "BMA.fil") %>%
  separate(INDV, into = c("SP", "LIB", "SAMPLE_ID"), sep = "_", remove = FALSE)

loc_stats_fil <- read.loc.stats(dir = "data/VCF/", vcf = "BMA.fil")

View(ind_stats_fil)
View(hap_ind_stats)

# plot missing data per indv ----
p1 <- ggplot(ind_stats_fil, aes(x = MISS_BMA.fil)) +
  geom_histogram(binwidth = .01, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MISS_BMA.fil, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 0.25),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "missing data per indv") +
  theme_standard

# plot Fis per indv ----
p2 <- ggplot(ind_stats_fil, aes(x = Fis_BMA.fil)) +
  geom_histogram(binwidth = .01, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(Fis_BMA.fil, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 0),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "Fis per indv") +
  theme_standard

# plot read depth per indv ----
p3 <- ggplot(ind_stats_fil, aes(x = MEAN_DEPTH_BMA.fil)) +
  geom_histogram(binwidth = 10, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH_BMA.fil, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean read depth per indv") +
  theme_standard

# plot depth vs missing ----
p4 <- ggplot(ind_stats_fil, aes(x = MEAN_DEPTH_BMA.fil, y = MISS_BMA.fil)) +
  geom_point() +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH_BMA.fil, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = mean(MISS_BMA.fil, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = 0.25),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean depth per indv", y = "% missing data") +
  theme_standard

# plot Fis vs missing data per indv ----
p5 <- ggplot(ind_stats_fil, aes(x = Fis_BMA.fil, y = MISS_BMA.fil)) +
  geom_point() +
  geom_vline(aes(xintercept = mean(Fis_BMA.fil, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 0),
                 color = "darkblue", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = mean(MISS_BMA.fil, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = 0.25),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "Fis per indv", y = "% missing data") +
  theme_standard

# plot Fis vs mean depth per indv ----
p6 <- ggplot(ind_stats_fil, aes(x = Fis_BMA.fil, y = MEAN_DEPTH_BMA.fil)) +
  geom_point() +
  geom_vline(aes(xintercept = mean(Fis_BMA.fil, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 0),
                 color = "darkblue", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = mean(MEAN_DEPTH_BMA.fil, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "Fis per indv", y = "mean depth per indv") +
  theme_standard

# plot distribution missing data per locus ----
p7 <- ggplot(ind_stats_fil, aes(x = MISS_BMA.fil)) +
  geom_histogram(binwidth = 0.01, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MISS_BMA.fil, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 0.1),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "% missing data per locus") +
  theme_standard

# plot distribution mean read depth ----
p8 <- ggplot(loc_stats_fil, aes(x = MEAN_DEPTH_BMA.fil)) +
  geom_histogram(binwidth = 5, color = "black", fill = "darkorange") +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH_BMA.fil, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean read depth per locus") +
  theme_standard

# plot read depth vs missing data ----
p9 <- ggplot(loc_stats_fil, aes(x = MEAN_DEPTH_BMA.fil, y = MISS_BMA.fil)) +
  geom_point() +
  geom_vline(aes(xintercept = mean(MEAN_DEPTH_BMA.fil, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = mean(MISS_BMA.fil, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = 0.1),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean depth per locus", y = "% missing data") +
  theme_standard

# plot no of SNPs per locus ----
p10 <- loc_stats_fil %>%
  count(CHR) %>%
  ggplot(aes(x = n)) +
  geom_histogram(binwidth = 1, color = "black", fill = "darkorange") + 
  labs(x = "number of SNPs per locus") +
  theme_standard

temp <- loc_stats_fil %>%
  count(CHR)

# plot number of SNPs per contig vs. mean depth ----
p11 <- left_join(temp, loc_stats_fil) %>%
  ggplot() +
  geom_point(aes(x = n, y = MEAN_DEPTH_BMA.fil)) +
  labs(x = "number of SNPs per contig", y = "mean depth") +
  theme_standard

# plot depth vs SNP quality ----
site_qual <- read.table("data/VCF/BMA.fil.lqual", 
                        header = TRUE, stringsAsFactors = FALSE) %>%
  mutate(PROB = 10^(-QUAL/10))

temp <- data.frame(loc_stats_fil$MEAN_DEPTH_BMA.fil, site_qual$QUAL) %>%
  rename(depth = loc_stats_fil.MEAN_DEPTH_BMA.fil, qual = site_qual.QUAL)

p12 <- ggplot(temp, aes(x = depth, y = qual)) +
  geom_point(size = 1) +
  geom_vline(aes(xintercept = mean(depth, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = mean(qual, na.rm = TRUE)),
                 color = "red", linetype = "dashed", size = 1) +
  geom_hline(aes(yintercept = 20),
                 color = "darkblue", linetype = "dashed", size = 1) +
  labs(x = "mean depth per locus", y = "SNP quality") +
  theme_standard

mfil <- multiplot(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, cols=2)

```
